简体   繁体   中英

C program to replace extended ASCII and test/print them on the cli

I currently have following code to lookup/replace (via a lookup list which is actually a char array of char arrays) extended ASCII characters. The replacing itself seems to work fine (although any tips for improvement, always welcome) but when using it on the cli (Ubuntu 15.04) , I get weird symbols back. Now, I'm a bit confused if this because my C code is not good enough or my terminal does not "know" how to print certain characters?

-------------- C code --------------

/* Include system header files.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>




unsigned char* sanitizeString(unsigned char *pCharArg1)
{
    unsigned char *pCharWorker = pCharArg1;

    /* The look-up map
     */
    unsigned char* charLookup[] = { "ab","àa", "ss", "åa", "ÅA", "ÿy", "XX","" };

    /* For every character in the input string we're going to verify
     * if the character needs to be replaced with one from the look-up
     * map.
     */
    while ( *pCharWorker != '\0' ) { 
        printf( "STARTING NEXT CHAR \n");   
        int finishedFlag = 0;
        //if ( (((int) *pCharWorker >= 65) && ((int) *pCharWorker <= 122)) ) {
            int j = 0;  
            /*
             * Loop the look-up map
             */
            while ((*(charLookup[j]) !='\0') && (finishedFlag == 0)) {
                printf( "Analazying *pCharWorker CHAR : %c \n", *pCharWorker    );
                printf( "Analazying *pCharWorker INT : %d \n", *pCharWorker    );
                printf( "Analazying *(charLookup[j]) CHAR  : %c \n", *(charLookup[j])    );         
                printf( "Analazying *(charLookup[j]) INT : %d \n", *(charLookup[j])    );       
                /* Inspected character matches one from the lookup map,
                 * so fetch the new character and assign it.
                 */
                if( *pCharWorker == *(charLookup[j]) ){
                    printf( "Detected char: %c \n", *pCharWorker   ); 
                    *pCharWorker = *(charLookup[j]+1);
                    printf( "Replaced with char: %c \n", *pCharWorker   ); 
                    finishedFlag = 1;   
                }
                j++;
            }
    //  }    
        printf( "======================= \n"  );             
        pCharWorker++;      
    }
    return pCharArg1;     
}


int main( int argc,  char* argv[] ){
    unsigned char* z = argv[1];
    printf( "PRINT : %s \n",  z ); 
    unsigned char* p2 = sanitizeString( z);
    printf( "Sanitized string: %s \n",  p2 ); 
    return 0;
}

Gives for example when executing:

koen@beite-f1:~$ gcc -o san sanitize.c

koen@beite-f1:~$ ./san ç

PRINT : ç

STARTING NEXT CHAR

Analazying *pCharWorker CHAR :

...

Sanitized string:


A great thanks for any input

br, Koen.

Your translation is failing because when charLookup is created some of the strings are longer than 2 chars because C is encoding them as variable length UTF-8. You've got utf8_string,output_char Dump the strings out in hex and you'll see.

For example, the second element that translates an accented "a" has a hex value of

C3 A0 61 00

Consider reversing the order within each of the elements in charLookup. That way, you'll have output_char,utf8_string and the second element becomes:

61 C3 A0 00

That way you can modify your code a bit. Note that you'll need to split pCharWorker into a source/dest pointers as in pCharInput/pcharOutput

char *xlat = charLookup[j];
char clean_char = xlat[0];
char *dirty_utf8 = xlat + 1;
int dirty_len = strlen(dirty_utf8);

if (strncmp(pCharInput,dirty_utf8,dirty_len) == 0) {
    *pCharOutput++ = clean_char;
    pCharInput += dirty_len
}
else {
    *pCharOutput++ = *pCharInput++;
}

NOTE: at the bottom of the function, you'll need a *pCharOutput = 0; that you didn't need before.

The above is just a fragment to give you the idea but it should be easy to incorporate. Note I did xlat et. al. as defs with assignments for brevity. You may split them into defs at func top and assignments in loop body if you wish.

You can also add an optimization, taking advantage of the fact that you can only have a UTF-8 string at the current position in the input string if the char is >= 0x80 (MSB set). Then, you can skip a pass through charLookup. So:

// skip charLookup scan if unnecessary
if ((*pCharInput & 0x80) == 0) {
    *pcharOutput++ = *pCharInput++;
    continue;
}

UPDATE:
Since you were amenable to tips, here's the full boat as I would do it. Note that the translation array should be global/static or the func prolog will recreate every time on entry. Also, the strlen/strncmp is unnecessary. I've also changed the loops around.

NOTE: This example has special handling for utf8 input that is not found in the translation, so at a mininum take a look at it.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

char *xlatlist[] = { "ba","aà", "ss", "aå", "AÅ", "yÿ", "XX", NULL };

// sanitize -- clean up utf8 sequences in a string
void
sanitize(char *dst,int uglyflg)
// uglyflg -- 1=output unknown utf8's
{
    char *src;
    char * const *xlatptr;
    const char *xlatstr;
    const char *utf8;
    int len;
    int foundflg;
    int chr;

    src = dst;

    while (1) {
        chr = *src;
        if (chr == 0)
            break;

        // skip translation loop if not utf-8
        if ((chr & 0x80) == 0) {
            *dst++ = *src++;
            continue;
        }

        // try to match a translation
        foundflg = 0;
        for (xlatptr = xlatlist;  ;  ++xlatptr) {
            xlatstr = *xlatptr;
            if (xlatstr == NULL)
                break;

            utf8 = xlatstr + 1;
            len = strlen(utf8);
            if (strncmp(src,utf8,len) == 0) {
                *dst++ = xlatstr[0];
                foundflg = 1;
                src += len;
                break;
            }
        }

        // utf8 translation found
        if (foundflg)
            continue;

        // NOTES:
        // (1) because of the optimization above, src _is_ pointing to a utf8
        //     but we have _no_ translation for it
        // (2) we can choose to skip it or just output it [and hope for the
        //     best], but ...
        // (3) first, we need to get the length utf8 string, so we only
        //     skip/output one utf8 string/char (e.g. we could have
        //     back-to-back utf8 strings)
        // (4) for reference here, the utf8 encoding is:
        //       byte 0: 11xxxxxx
        //       byte 1: 10xxxxxx

        // output the first char of the unknown utf8 sequence
        if (uglyflg)
            *dst++ = *src;
        ++src;

        // output the remaining ones
        for (;  ; ++src) {
            chr = *src;

            // EOS
            if (chr == 0)
                break;

            // back to ascii
            if ((chr & 0x80) == 0)
                break;

            // start of new utf8 string
            if ((chr & 0x40) != 0)
                break;

            // output the unknown utf8 secondary char
            if (uglyflg)
                *dst++ = chr;
        }
    }

    *dst = 0;
}

int
main(int argc,char **argv)
{
    char *z;

    --argc;
    ++argv;

    z = *argv;
    if (z == NULL) {
        printf("no argument provided\n");
        exit(1);
    }

    printf("PRINT : %s\n",z); 

    sanitize(z,0);
    printf("Sanitized string: %s\n",z); 

    return 0;
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM