繁体   English   中英

在 cURL 中使用 UTF-8 数据流,数据流何时从 UTF-8 转换为 ASCII?

[英]Using UTF-8 Datastreams in cURL, when is the datastream converted from UTF-8 to ASCII?

使用 cURL 将字符流从 Internet 中取出时,数据流何时从多字节数据类型转换为单字节字符数组?

在这里写了一个程序,它似乎在回调 function 中使用 ASCII 工作。

但是,我编写了另一个程序,它使用 UTF-8 和 wchar_t 数据类型,这似乎也可以工作。 即使 wchar_t 类型在我的机器上是 4 个字节,而 char 是 1 个字节,数据流似乎也没有区分这两种数据类型。

我猜这个程序有某种类型的转换是透明的,但我不知道(我认为在 UTF-8 ASCII 字符仍然占用 memory 的 1 个字节,但是当程序使用 wchar_t 数据类型时,系统填充常规带有零的 ascii 字符将它们转换为 4 个字节,但这不是程序员实现的......)。

#include "multicurl.h"

#define MAX_WAIT_MSECS 5*1000 /* Wait max. 5 seconds */

/*  The largest difference between the ASCII and UTF-8 variations of this program is that this callback function is now dealing with an array of wchar_t blocks rather than chars which are always 1 byte long, but it still works the same basic way. */
static size_t write_callback(wchar_t *ptr, size_t size, size_t nmemb, void *userdata){// cURL callback function [read in datastream to memory]
    // This prototype is provided by cURL, with an argument at the end for our data structure.
    // This function is repeatedly called by cURL until there is no more data in the data stream; *ptr [it is assumed cURL handles memory management for this pointer].
    
    size_t realsize = nmemb * size;// The number of bytes in the datastream [there is no NULL char]
    MemType *mem = (MemType *)userdata;
    wchar_t *tmp = realloc(mem->memory, mem->size  + realsize + sizeof(wchar_t) );// We add 1 wchar_t unit for the NULL character.

    if (tmp == NULL){
        printf("Not Enough Memory, realloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }

    mem->memory = tmp;
    memcpy(&(mem->memory[ mem->size /  sizeof(wchar_t) ]), ptr, realsize );// Starting at the last element copy in datastream [it overwrites the last element]
    mem->size += realsize;// The actual size, in bytes, is realsize + ( 1 * sizeof(wchar_t) ), however realsize gives us the location of the last element.
    mem->memory[ mem->size / sizeof(wchar_t) ] = 0;// The datastream doesn't include a NULL character, so we zeroize the last element.
    // We overwrite the NULL character {the zeroized element} on the next callback iteration, if any.

    return (size * nmemb);// cURL crosschecks the datastream with this return value.
}

void *SetUpCurlHandle(CURLM * mh, wchar_t *utf8_url, MemType *output){
// Take in a multi handle pointer address, a URL and a struct pointer address, set up the curl easy handle and add it to the multi handle.

    /* Convert our UTF-8 URL string to a regular ASCII URL string. */
    char* url = (char*) malloc ( wcslen( utf8_url ) + 1 );
    wcstombs(url, utf8_url, wcslen( utf8_url ) * sizeof( wchar_t ) );

    CURL *hnd = NULL;
    output->memory = malloc( sizeof( wchar_t ) );              // Initialize the memory component of the structure.
    output->size = 0;                                           // Initialize the size component of the structure.

    // Initialize the cURL handle.
    hnd = curl_easy_init();

    if(hnd){

        // Setup the cURL options.
        curl_easy_setopt(hnd, CURLOPT_BUFFERSIZE, 102400L);
        curl_easy_setopt(hnd, CURLOPT_URL, url);// Set the request URL
        curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(hnd, CURLOPT_USERAGENT, "curl/7.80.0");
        curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
        curl_easy_setopt(hnd, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
        curl_easy_setopt(hnd, CURLOPT_FTP_SKIP_PASV_IP, 1L);
        curl_easy_setopt(hnd, CURLOPT_TCP_KEEPALIVE, 1L);
        curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, write_callback);// The callback function to write data to.
        curl_easy_setopt(hnd, CURLOPT_WRITEDATA, (void *)output);// Send the address of the data struct to callback func.
        //curl_easy_setopt(hnd, CURLOPT_VERBOSE, 1);

        curl_multi_add_handle(mh, hnd);
    }else{
        output->memory[0] = '\0';
    }    
    return NULL;// The output struct was passed by reference no need to return anything.
}

CURLM *SetUpMultiCurlHandle(){
    curl_global_init(CURL_GLOBAL_ALL);

    CURLM * mh = curl_multi_init();
    return mh;
}

void *PerformMultiCurl(CURLM * mh) 
/*Take in a preset multi handle, request data from the remote server asynchronously {it's assumed cURL is using threads transparent to the calling program}.
   Remove the handles from memory.*/
{
    CURLMsg *msg=NULL;
    CURL *hnd = NULL;
    CURLcode return_code = 0;
    int still_running = 0;
    int msgs_left = 0;

    curl_multi_perform(mh, &still_running);// Perform the requests.
    do {
        int numfds=0;
        int res = curl_multi_wait(mh, NULL, 0, MAX_WAIT_MSECS, &numfds);
        if(res != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_wait() returned %d\n", res);
            return NULL;
        }
        curl_multi_perform(mh, &still_running);
        
       /* Without this loop the program will proceed to the next statement, most likely before the messages are retrieved from the server.
           The easy handle requests are conducted asynchronously, but one multi handle request is obviously conducted sequentially (can use pthreads to make asynchronous multi requests).*/
    } while(still_running); 
    
    
    /* This portion of the code will clean up and remove the handles from memory, you could change this to make them more persistent */
    while ((msg = curl_multi_info_read(mh, &msgs_left))) {
        if (msg->msg == CURLMSG_DONE) {
            hnd = msg->easy_handle;

            return_code = msg->data.result;
            if(return_code!=CURLE_OK) {
                fprintf(stderr, "CURL error code: %d\n", msg->data.result);
                continue;
            }

            curl_multi_remove_handle(mh, hnd);
            curl_easy_cleanup(hnd);
            hnd = NULL;
        }
        else {
            fprintf(stderr, "error: after curl_multi_info_read(), CURLMsg=%d\n", msg->msg);
        }
    }

    curl_multi_cleanup(mh);
    curl_global_cleanup();
    return NULL;
}

可以在此处找到该程序的完整 UTF-8 变体

如您所料,它不起作用 libcurl 无法知道 function 期望wchar_t*而它应该期望char*

如果您检查MyOutputStruct1.memory[0] ,您会发现它不包含应有的内容。 例如,当请求https://stackoverflow.com时,它包含 0x4f44213c。 这显然是错误的,因为这远远超出了有效代码点的范围。 这实际上是卡在一个wchar_t中的前四个代码点 ( <!DO )(按 LE 顺序)。

由于第二个错误,它似乎可以工作。 打印宽字符串时,您需要使用%ls ,而不是%s

wprintf(L"Output:\n%s\n", MyOutputStruct1.memory);

应该

printf("Output:\n%ls\n", MyOutputStruct1.memory);
// -or-
wprintf(L"Output:\n%ls\n", MyOutputStruct1.memory);

基本上,代码需要一个char*贯穿始终。 指针的类型是wchar_t* ,但它在任何地方都用作char* 因此,这两个错误大多在相关程序中“取消”。 (我没看,但我预计输入的长度不能被sizeof(wchar_t)整除。)如果指针实际上被用作wchar_t* (例如,如果它的元素已经过检查,或者如果它已被传递给w函数),问题将很明显。

正如评论部分所述,所有这些真正需要的是 UTF-8 解析器。 字符可以保存 UTF-8 但我们不能轻易地单独处理每个字符而不将它们转换为其他数据类型[一些 UTF-8 字符大于 1 个字节]。 所以我在 libutf-8 的帮助下写了一个解析器。

/* gcc unicode.c -o unicode -lutf-8 
This program makes use of libutf-8.
http://www.whizkidtech.redprince.net/i18n/
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <locale.h>
#include <utf-8.h>

int* parse_UTF8_bitstream( size_t *len, const char *input_stream )
/* Parse a UTF-8 char bytestream into a 4-byte wide integer bytestream [so we can address each UTF-8 character individually] */
{
    *len = 0; // This will give us the number of wide-characters not counting NULL.
    int i = 0;
    int n;
    unsigned int *output = malloc ( sizeof( unsigned int ) );
    unsigned int *temp;
    while ( input_stream[ i ] ){
        temp = ( unsigned int* ) realloc(output, (*len + 1) * sizeof( unsigned int ) );
        output = temp;
        output[ *len ] = sgetu8( (unsigned char *) &input_stream[ i ], &n );
        i+= n; //Skip this many chars to the next UTF-8 code.
        *len = *len + 1;
    }
    
    /* Make sure the last character is NULL */
    temp = ( unsigned int* ) realloc(output, (*len + 1) * sizeof( unsigned int ) );
    output = temp;
    output[ *len ] = 0;
    
    return (int*)output; // This is our wide character string.
}

void process_string(const char *s)
{
    printf("%s\n", s);
    printf("LENGTH: %lu 1-Byte Characters\n\n", strlen( s ));
  
    size_t len;
    int* outputstream = parse_UTF8_bitstream( &len, s );
    
    printf("\n%ls\n", outputstream);
    printf("LENGTH: %lu Wide Characters\n", len);
    for(int i = 0; i<len; i++){
        printf("%lc\n", outputstream[ i ]);
    }
    
    free ( outputstream );
}


int main(void)
/* "Hello World", in Arabic, Russian, and Greek */
{
    setlocale(LC_ALL, "");
    
    const char *string1 = "مرحبا بالعالم";
    const char *string2 = "Всем привет";
    const char *string3 = "Γεια σου κόσμε";
    process_string( string1 );
    process_string( string2 );
    process_string( string3 );
          
    return 0;
}

这是我之前发布的同一个程序,但是,它不需要任何特殊的库。 它使用stdlib中的mbtowc() function(我的解析器不做错误检查,如果输入stream不包含有效的ZAE3B3DF9970B49B652Z3E608759BC,结果是未定义)。95

从 mbtowc() 手册页:

#include <stdlib.h>

int mbtowc(wchar_t * 限制 wcharp, const char * 限制 mbchar, size_t nbytes);

如果 mbchar 是 NULL,则 mbtowc() function 如果支持移位状态则返回非零,否则返回零。

否则,如果 mbchar 不是 null 指针,如果 mbchar 表示 null 宽字符,则 mbtowc() 返回 0,或者返回在 mbchar 中处理的字节数,或者如果无法识别或转换多字节字符,则返回 -1。 在这种情况下,mbtowc() 的内部转换 state 是未定义的。

/* gcc unicode.c -o unicode  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <locale.h>


wchar_t* parse_UTF8_bitstream( size_t *len, const char *input_stream )
/* Parse a UTF-8 char bytestream into a 4-byte wide wchar_t bytestream [so we can address each UTF-8 character individually] */
{
    *len = 0; /* This will give us the number of wide-characters not counting NULL. */
    int i = 0; /* This iterates through the mb char stream. */
    size_t wc_size = sizeof( wchar_t ); /* The size of our destination datatype. */
    wchar_t *output = ( wchar_t* ) malloc ( sizeof( wchar_t ) );
    wchar_t *temp;
    while ( input_stream[ i ] ){
        temp = ( wchar_t* ) realloc(output, (*len + 1) * sizeof( wchar_t ) );
        output = temp;
        /* i skips this many chars to the next UTF-8 code.*/
        i += mbtowc( &output[ *len ], &input_stream[ i ], wc_size );
        *len = *len + 1;
    }
    
    /* Make sure the last wide-character is NULL */
    temp = ( wchar_t* ) realloc(output, (*len + 1) * sizeof( wchar_t ) );
    output = temp;
    output[ *len ] = 0;
    
    return output; /* This is our wide character string. */
}

void process_string(const char *s)
{
    printf("%s\n", s);
    printf("LENGTH: %lu 1-Byte Characters\n\n", strlen( s ));
  
    size_t len;
    wchar_t* outputstream = parse_UTF8_bitstream( &len, s );
    
    printf("\n%ls\n", outputstream);
    printf("LENGTH: %lu Wide Characters\n", len);
    for(int i = 0; i<len; i++){
        printf("%lc\n", outputstream[ i ]);
    }
    
    free ( outputstream );
}


int main(void)
/* "Hello World", in Arabic, Russian, and Greek */
{
    setlocale(LC_ALL, "");
    
    const char *string1 = "مرحبا بالعالم";
    const char *string2 = "Всем привет";
    const char *string3 = "Γεια σου κόσμε";
    process_string( string1 );
    process_string( string2 );
    process_string( string3 );
          
    return 0;
}

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM