C程序來替換擴展的ASCII並在cli上測試/打印它們

Question

我目前有以下代碼，用一個查找表（實際上是一個由char數組組成的數組）來查找並替換擴展的ASCII字符。 替換本身似乎運作良好（不過隨時歡迎任何改進建議），但是在命令行（Ubuntu 15.04）上使用它時，我得到了奇怪的符號。 我現在不確定這是因為我的C代碼不夠好，還是因為我的終端不“知道”如何打印某些字符。

-------------- C代碼--------------

/* Include system header files.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>




/*
 * sanitizeString -- replace bytes of a string in place using a look-up map.
 *
 * pCharArg1: NUL-terminated, writable string; it is modified in place.
 *
 * Returns pCharArg1 (the same pointer that was passed in).
 *
 * The function walks the input one BYTE at a time.  Each byte is compared
 * with the FIRST byte of every look-up entry; on a match it is overwritten
 * with the SECOND byte of that entry.  A trace of every comparison is
 * printed to stdout.
 *
 * NOTE(review): if the compiler stores the accented literals below as UTF-8,
 * each accented character occupies more than one byte.  The first byte of
 * such an entry is then a UTF-8 lead byte and the "second byte" is a UTF-8
 * continuation byte, not the ASCII letter written after it -- which would
 * explain odd symbols in the output.  Confirm by dumping the entries in hex.
 */
unsigned char* sanitizeString(unsigned char *pCharArg1)
{
    /* Cursor over the input; the original pointer is kept for the return. */
    unsigned char *pCharWorker = pCharArg1;

    /* The look-up map.
     * Entry layout as used by the loop below: byte 0 is the byte searched
     * for, byte 1 is its replacement.  The empty string "" ends the list.
     */
    unsigned char* charLookup[] = { "ab","àa", "ss", "åa", "ÅA", "ÿy", "XX","" };

    /* For every character in the input string we're going to verify
     * if the character needs to be replaced with one from the look-up
     * map.
     */
    while ( *pCharWorker != '\0' ) { 
        printf( "STARTING NEXT CHAR \n");   
        /* Set to 1 as soon as the current byte has been replaced. */
        int finishedFlag = 0;
        //if ( (((int) *pCharWorker >= 65) && ((int) *pCharWorker <= 122)) ) {
            int j = 0;  
            /*
             * Loop the look-up map until the "" sentinel is reached or a
             * replacement has been made for the current byte.
             */
            while ((*(charLookup[j]) !='\0') && (finishedFlag == 0)) {
                /* Debug trace: %c emits the single raw byte, %d its value. */
                printf( "Analazying *pCharWorker CHAR : %c \n", *pCharWorker    );
                printf( "Analazying *pCharWorker INT : %d \n", *pCharWorker    );
                printf( "Analazying *(charLookup[j]) CHAR  : %c \n", *(charLookup[j])    );         
                printf( "Analazying *(charLookup[j]) INT : %d \n", *(charLookup[j])    );       
                /* Inspected character matches one from the lookup map,
                 * so fetch the new character and assign it.
                 * Only the first byte of the entry is compared, and only
                 * its second byte is copied.
                 */
                if( *pCharWorker == *(charLookup[j]) ){
                    printf( "Detected char: %c \n", *pCharWorker   ); 
                    *pCharWorker = *(charLookup[j]+1);
                    printf( "Replaced with char: %c \n", *pCharWorker   ); 
                    finishedFlag = 1;   
                }
                j++;
            }
    //  }    
        printf( "======================= \n"  );             
        /* Advance by exactly one byte, whether or not a replacement happened. */
        pCharWorker++;      
    }
    return pCharArg1;     
}


/*
 * Entry point: prints the first command-line argument, sanitizes it in place
 * with sanitizeString() and prints the result.
 *
 * Returns 0 on success, 1 when no argument was supplied.
 */
int main( int argc,  char* argv[] ){
    /* Without an argument argv[1] is NULL; passing it to printf("%s") or to
     * sanitizeString() would dereference a null pointer.
     */
    if ( argc < 2 ) {
        fprintf( stderr, "no argument provided\n" );
        return 1;
    }

    /* argv holds plain char strings; make the signedness change explicit. */
    unsigned char* z = (unsigned char*) argv[1];
    printf( "PRINT : %s \n",  z ); 
    unsigned char* p2 = sanitizeString( z);
    printf( "Sanitized string: %s \n",  p2 ); 
    return 0;
}

例如在執行時給出：

koen@beite-f1:~$ gcc -o san sanitize.c

koen@beite-f1:~$ ./san ç

PRINT : ç

STARTING NEXT CHAR

Analazying *pCharWorker CHAR :

...

Sanitized string:

非常感謝您的意見

BR，Koen。

Answer 1

您的轉換失敗，是因為在創建charLookup時，某些字符串長於2個字節：C將其中的非ASCII字符編碼為可變長度的UTF-8。 您目前每個元素的排列是utf8_string,output_char。 把這些字符串以十六進制形式轉儲出來，您就會看到。

例如，用來轉換帶重音符號的“a”（à）的第二個元素，其十六進制值為

C3 A0 61 00

考慮反轉charLookup中每個元素內的順序。 這樣，您將擁有output_char,utf8_string ，第二個元素變為：

61 C3 A0 00

這樣，您只需稍微修改代碼。 請注意，您需要將pCharWorker拆分為源/目標兩個指針，例如pCharInput / pCharOutput。

// Entry layout after the reordering: output_char followed by utf8_string.
const char *xlat = charLookup[j];
char clean_char = xlat[0];              // ASCII replacement character
const char *dirty_utf8 = xlat + 1;      // UTF-8 byte sequence to look for
size_t dirty_len = strlen(dirty_utf8);  // strlen returns size_t, not int

if (strncmp(pCharInput,dirty_utf8,dirty_len) == 0) {
    // whole UTF-8 sequence matched: emit one char, consume dirty_len bytes
    *pCharOutput++ = clean_char;
    pCharInput += dirty_len;
}
else {
    // no match: copy the current byte through unchanged
    *pCharOutput++ = *pCharInput++;
}

注意：在函數的底部，您需要加上*pCharOutput = 0; 這在以前是不需要的，但現在輸出可能比輸入短，所以必須重新寫入字符串終止符。

上面只是給您提供思路的一個片段，但是應該很容易合並進去。 注意，為簡潔起見，我把xlat等變量寫成了帶初始化的定義。 如果願意，可以將它們拆分為函數頂部的定義和循環體中的賦值。

您還可以利用以下事實來添加優化：只有當char >= 0x80（設置了MSB）時，輸入字符串的當前位置才可能是UTF-8序列。 否則，您可以跳過對charLookup的掃描。 所以：

// skip charLookup scan if unnecessary:
// a byte with the high bit clear is plain ASCII and cannot start a UTF-8
// sequence, so it is copied straight through
if ((*pCharInput & 0x80) == 0) {
    *pCharOutput++ = *pCharInput++;
    continue;
}

更新：
既然您歡迎改進建議，那么這就是我會寫的完整版本。 請注意，轉換數組應該是全局/靜態的，否則函數序言會在每次進入函數時重新創建它。 另外，strlen / strncmp也是不必要的。 我也改變了循環。

注意：此示例對轉換表中找不到的utf8輸入做了特殊處理，所以請至少看一下這部分。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

// Translation table, terminated by NULL.
// Entry layout: byte 0 is the ASCII replacement character, the remaining
// bytes are the UTF-8 sequence that gets replaced by it.
// Entries whose sequence starts with a plain ASCII byte (e.g. "ba") can never
// match, because sanitize() only consults the table at bytes >= 0x80.
char *xlatlist[] = { "ba","aà", "ss", "aå", "AÅ", "yÿ", "XX", NULL };

// sanitize -- clean up utf8 sequences in a string
//
// dst     -- NUL-terminated, writable string; rewritten in place (the result
//            is never longer than the input).  NULL is accepted and ignored.
// uglyflg -- 1=output unknown utf8's unchanged, 0=drop them
//
// Sequences found in xlatlist are replaced by their single ASCII character.
void
sanitize(char *dst,int uglyflg)
{
    char *src;
    char * const *xlatptr;
    const char *xlatstr;
    const char *utf8;
    size_t len;
    int foundflg;
    unsigned char chr;

    // nothing to do (and nothing safe to dereference) without a string
    if (dst == NULL)
        return;

    // src reads ahead of (or level with) dst, so in-place rewriting is safe
    src = dst;

    while (1) {
        // test raw bytes as unsigned so the bit checks do not depend on
        // whether plain char is signed
        chr = (unsigned char) *src;
        if (chr == 0)
            break;

        // skip translation loop if not utf-8
        if ((chr & 0x80) == 0) {
            *dst++ = *src++;
            continue;
        }

        // try to match a translation
        foundflg = 0;
        for (xlatptr = xlatlist;  *xlatptr != NULL;  ++xlatptr) {
            xlatstr = *xlatptr;

            // ignore degenerate entries: an empty string has no byte 1 to
            // point at, and an empty utf8 part would match with length 0
            // and never advance src
            if (xlatstr[0] == '\0' || xlatstr[1] == '\0')
                continue;

            utf8 = xlatstr + 1;
            len = strlen(utf8);
            if (strncmp(src,utf8,len) == 0) {
                *dst++ = xlatstr[0];
                foundflg = 1;
                src += len;
                break;
            }
        }

        // utf8 translation found
        if (foundflg)
            continue;

        // NOTES:
        // (1) because of the optimization above, src _is_ pointing to a utf8
        //     but we have _no_ translation for it
        // (2) we can choose to skip it or just output it [and hope for the
        //     best], but ...
        // (3) first, we need to get the length utf8 string, so we only
        //     skip/output one utf8 string/char (e.g. we could have
        //     back-to-back utf8 strings)
        // (4) for reference here, the utf8 encoding is:
        //       byte 0: 11xxxxxx
        //       byte 1: 10xxxxxx

        // output the first char of the unknown utf8 sequence
        if (uglyflg)
            *dst++ = *src;
        ++src;

        // output the remaining ones
        for (;  ; ++src) {
            chr = (unsigned char) *src;

            // EOS
            if (chr == 0)
                break;

            // back to ascii
            if ((chr & 0x80) == 0)
                break;

            // start of new utf8 string
            if ((chr & 0x40) != 0)
                break;

            // output the unknown utf8 secondary char
            if (uglyflg)
                *dst++ = *src;
        }
    }

    // the output may be shorter than the input, so terminate it again
    *dst = 0;
}

// main -- show the first command-line argument before and after sanitizing
// exits with status 1 when no argument was given, returns 0 otherwise
int
main(int argc,char **argv)
{
    // argv[1] is the first real argument (NULL when none was given)
    char *arg = argv[1];

    (void) argc;

    if (arg == NULL) {
        puts("no argument provided");
        exit(1);
    }

    printf("PRINT : %s\n",arg);

    // rewrite the argument in place, dropping untranslatable utf8
    sanitize(arg,0);
    printf("Sanitized string: %s\n",arg);

    return 0;
}

C程序來替換擴展的ASCII並在cli上測試/打印它們

問題描述

...

1 個解決方案

解決方案1
2 2015-10-11 23:44:14

C程序來替換擴展的ASCII並在cli上測試/打印它們

問題描述

...

1 個解決方案

解決方案1 2 2015-10-11 23:44:14

解決方案1
2 2015-10-11 23:44:14