C 中的连续字符串替换

Question

Please see the following code, which performs consecutive character/string replacements by looping through all the utf8 characters to be replaced;请看下面的代码，它通过循环遍历所有要替换的utf8字符来执行连续的字符/字符串替换； would you propose another, more efficient, construct?你会提出另一个更有效的结构吗？

/* UTF-8 encoded glyphs that convert() replaces, terminated by a NULL
   sentinel.  Entry i of this table is replaced by ebcdic[i], so both
   tables must list their entries in the same order.
   The strings are literals and must never be written through, hence the
   const qualification on both the characters and the pointers. */
static const char *const utf8[66] = {
  "◊",    "⎕",    "⍞",    "⌹",    "⊤",    "⊥",
  "⌶",    "⌈",    "∪",    "⍕",    "⍎",    "│",
  "⍟",    "∆",    "∇",    "→",    "←",    "⌊",
  "┐",    "└",    "─",    "↑",    "↓",    "≡",
  "⍸",    "⋸",    "∵",    "⌷",    "⍂",    "⌻",
  "⊣",    "⊢",    "⋄",    "┘",    "┌",    "⍺",
  "⊂",    "⊃",    "⍝",    "⍲",    "⍴",    "⍱",
  "⌽",    "⊖",    "○",    "∨",    "⍳",    "⍬",
  "∈",    "∩",    "⌿",    "⍀",    "≥",    "≤",
  "≠",    "×",    "÷",    "⍙",    "∘",    "⍵",
  "⍫",    "⍋",    "⍒",    "¯",    "¨",    NULL
};

/* Single-byte replacement strings, terminated by a NULL sentinel.
   Entry i of this table is what convert() substitutes for utf8[i], so
   both tables must list their entries in the same order.
   The strings are literals and must never be written through, hence the
   const qualification on both the characters and the pointers. */
static const char *const ebcdic[66] = {
  "\x8d", "\x90", "\x91", "\x92", "\x98", "\x9d",
  "\x9f", "\xa9", "\xac", "\xae", "\xaf", "\xb3",
  "\xb5", "\xb6", "\xb7", "\xb8", "\xbd", "\xbe",
  "\xbf", "\xc0", "\xc4", "\xc6", "\xc7", "\xcf",
  "\xd0", "\xd1", "\xd2", "\xd3", "\xd4", "\xd5",
  "\xd6", "\xd7", "\xd8", "\xd9", "\xda", "\xe0",
  "\xe2", "\xe3", "\xe4", "\xe5", "\xe6", "\xe7",
  "\xe8", "\xe9", "\xea", "\xeb", "\xec", "\xed",
  "\xee", "\xef", "\xf0", "\xf1", "\xf2", "\xf3",
  "\xf4", "\xf5", "\xf6", "\xf7", "\xf8", "\xf9",
  "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", NULL
};

/* replace() is defined further down in this file; declare it before use. */
char *replace(const char *s, const char *oldW, const char *newW);

/* Replace every occurrence of each utf8[i] in `line` with ebcdic[i] and
   append a newline to the result.

   line:    NUL-terminated input string; it is not modified.
   returns: a newly allocated string that the caller must free(), or NULL
            if an allocation fails. */
char *convert(char *line)
{
  /* Work on a private copy so that every intermediate string, including
     the first one, can be released with free(). */
  char *current = malloc(strlen(line) + 1);   /* +1 for the terminator */
  if (!current)
    return NULL;
  strcpy(current, line);

  /* Both tables end with a NULL sentinel. */
  for (int i = 0; ebcdic[i]; i++) {
    char *next = replace(current, utf8[i], ebcdic[i]);

    /* replace() hands back a fresh allocation, so take ownership of it
       directly instead of copying it into yet another buffer. */
    free(current);
    if (!next)
      return NULL;
    current = next;
  }

  /* Grow by two bytes: one for the trailing '\n', one for the terminator. */
  size_t len = strlen(current);
  char *tmp = realloc(current, len + 2);
  if (!tmp) {
    free(current);
    return NULL;
  }
  tmp[len] = '\n';
  tmp[len + 1] = '\0';

  return tmp;
}

/* Build a copy of `s` in which every non-overlapping occurrence of `oldW`
   (scanning left to right) is replaced by `newW`.

   s:       NUL-terminated input string; it is not modified.
   oldW:    substring to search for.  An empty `oldW` matches nothing, so
            the result is then a plain copy of `s`.
   newW:    replacement text; may be shorter or longer than `oldW`.
   returns: a newly allocated string that the caller must free(), or NULL
            if the allocation fails. */
char *replace(const char *s, const char *oldW, const char *newW)
{
  size_t oldWlen = strlen(oldW);
  size_t newWlen = strlen(newW);
  size_t slen = strlen(s);
  size_t cnt = 0;

  /* Count the matches.  Skipping the search for an empty pattern avoids
     an endless loop, because the cursor would never advance. */
  if (oldWlen > 0) {
    const char *p = s;
    while ((p = strstr(p, oldW)) != NULL) {
      cnt++;
      p += oldWlen;
    }
  }

  /* Every match removes oldWlen bytes and adds newWlen bytes. */
  char *result = malloc(slen - cnt * oldWlen + cnt * newWlen + 1);
  if (!result)
    return NULL;

  char *out = result;
  if (cnt > 0) {
    const char *match;
    while ((match = strstr(s, oldW)) != NULL) {
      /* Copy the unmatched text in front of the match, then the
         replacement, and continue right after the match. */
      size_t gap = (size_t)(match - s);
      memcpy(out, s, gap);
      out += gap;
      memcpy(out, newW, newWlen);
      out += newWlen;
      s = match + oldWlen;
    }
  }

  /* Copy whatever follows the last match, terminator included. */
  strcpy(out, s);
  return result;
}

update-001: added code for replace(). update-001：为 replace() 添加了代码。
update-002: changed for/loop to while. update-002：将 for/loop 更改为 while。

Thank you for looking, I care about readability and memory usage more than performance in this particular case.感谢您的观看，在这种特殊情况下，我更关心可读性和 memory 的使用，而不是性能。

Answer 1

I assume you are writing this code as a learning experience; otherwise, scrap it and use an existing tool/library.我假设您编写该代码是为了学习；否则请放弃它并使用现有的工具/库。

When you want to convert characters/codepoints, the basic algorithm goes like this:当你想转换字符/代码点时，基本算法是这样的：

Get next codepoint from input string, convert that codepoint (or keep as is), store the converted codepoint at end of output string.从输入字符串中获取下一个代码点，转换该代码点（或保持原样），将转换后的代码点存储在 output 字符串的末尾。 Repeat.重复。

Since your input string uses one char per codepoint, getting the next codepoint is as simple as looping through the chars of the input string.由于您的输入字符串每个代码点使用一个 char，因此获取下一个代码点就像遍历输入字符串中的各个 char 一样简单。 It also means that the codepoint conversion can be done with a simple lookup table of size 256 (assuming 8-bit chars).这也意味着代码点转换可以通过一个大小为 256 的简单查找表来完成（假设 char 为 8 位）。 The converted UTF-8 codepoints are not necessarily one byte long, so we have to account for that.转换后的 UTF-8 代码点的长度不一定是 1 个字节，因此我们必须考虑到这一点。

/* Lookup table indexed by input byte value; each entry is the UTF-8
   string that byte converts to.
   The designated initializers below set
   ebdic2utf8_lut[0x8d] = "◊", ebdic2utf8_lut[0x90] = "⎕", and so on.
   Elements that are not explicitly named in the initializer list are
   initialized to NULL, which may be treated as "keep the byte as is". */

static const char *const ebdic2utf8_lut[256] = {
  [0x8d] = "◊", [0x90] = "⎕", [0x91] = "⍞", [0x92] = "⌹",
  /* Rest of initializations left out for brevity */
};

/* Convert `src` byte by byte using ebdic2utf8_lut: a byte with a table
   entry is replaced by that entry's UTF-8 string, any other byte is
   copied unchanged.

   src:     NUL-terminated input string; it is not modified.
   returns: a newly allocated string that the caller must free().
            On allocation failure a message is printed with perror() and
            the program exits with status 1.

   The output size is computed in a first pass so that the result is
   allocated exactly once, instead of being resized for every byte. */
char * convert(const char *src)
{
   /* Pass 1: add up the length each input byte expands to. */
   size_t dst_length = 0;
   for (const char *p = src; *p; p++)
     {
        /* We want to lose the sign of `char` for the lut */
        const char *utf8 = ebdic2utf8_lut[(unsigned char)*p];
        dst_length += utf8 ? strlen(utf8) : 1;
     }

   /* One extra byte for the zero termination */
   char *dst = malloc(dst_length + 1);
   if (!dst)
      {
         perror("String allocation failed");
         exit(1);
      }

   /* Pass 2: write the converted bytes into the destination string. */
   char *out = dst;
   for (const char *p = src; *p; p++)
     {
        const char *utf8 = ebdic2utf8_lut[(unsigned char)*p];
        if (utf8)
          {
            size_t utf8_length = strlen(utf8);
            memcpy(out, utf8, utf8_length);
            out += utf8_length;
          }
        else
          {
            /* No conversion for this byte: keep it as is */
            *out++ = *p;
          }
     }
   *out = '\0';

   return dst;
}

/* Small demonstration: convert a sample string and print the result. */
int main(void)
{
  char *str = convert("Hello\x90\x91\x92World");

  /* This should print "Hello⎕⍞⌹World", unless you are under Windows.
     Windows and utf-8 doesn't mix very well */
  printf("%s\n", str);

  /* convert() returns an allocated string that the caller owns. */
  free(str);
  return 0;
}

This code has not been checked for errors; use at your own risk.此代码尚未经过错误检查，使用风险自负。

Answer 2

To convert from UTF-8 to EBCDIC, it is still possible to use a lookup table.要从 UTF-8 转换为 EBCDIC，仍然可以使用查找表。 However, a LUT that maps directly from UTF-8 to EBCDIC is impractical because of its size.但是，直接从 UTF-8 映射到 EBCDIC 的 LUT 由于体积过大而不切实际。 Instead, we can use the EBCDIC->UTF-8 LUT and loop through it to find a match.相反，我们可以使用 EBCDIC->UTF-8 的 LUT，并循环遍历它以找到匹配项。

The basic algorithm for the string conversion is still the same: get the next UTF-8 codepoint/character in the input string, convert it to EBCDIC, push the converted char to the output string (or push one char unchanged if no conversion was found), remove the consumed prefix from the input string, and repeat.字符串转换的基本算法仍然相同：获取输入字符串中的下一个 UTF-8 代码点/字符，将其转换为 EBCDIC，将转换后的字符追加到输出字符串（如果未找到转换，则原样追加一个字符），从输入字符串中移除已处理的前缀，然后重复。

When doing the UTF-8->EBCDIC conversion, we have to be aware that a UTF-8 codepoint may be multiple bytes long.在进行 UTF-8->EBCDIC 转换时，我们必须注意一个 UTF-8 代码点可能占用多个字节。 So we have to compare multiple bytes against the prefix of the input string, and we must also advance the input pointer by multiple bytes.所以我们必须对输入字符串前缀中的多个字节进行比较，并且还必须将输入指针向前移动多个字节。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>


/* Lookup table indexed by single-byte code; each entry is the UTF-8
   string for that code.  Indices that are not listed are NULL. */
static const char *const ebdic2utf8_lut[256] = {
  /* 0x8_ */
  [0x8d] = "◊",
  /* 0x9_ */
  [0x90] = "⎕",
  [0x91] = "⍞",
  [0x92] = "⌹",
  [0x98] = "⊤",
  [0x9d] = "⊥",
  [0x9f] = "⌶",
  /* 0xa_ */
  [0xa9] = "⌈",
  [0xac] = "∪",
  [0xae] = "⍕",
  [0xaf] = "⍎",
  /* 0xb_ */
  [0xb3] = "│",
  [0xb5] = "⍟",
  [0xb6] = "∆",
  [0xb7] = "∇",
  [0xb8] = "→",
  [0xbd] = "←",
  [0xbe] = "⌊",
  [0xbf] = "┐",
  /* 0xc_ */
  [0xc0] = "└",
  [0xc4] = "─",
  [0xc6] = "↑",
  [0xc7] = "↓",
  [0xcf] = "≡",
  /* 0xd_ */
  [0xd0] = "⍸",
  [0xd1] = "⋸",
  [0xd2] = "∵",
  [0xd3] = "⌷",
  [0xd4] = "⍂",
  [0xd5] = "⌻",
  [0xd6] = "⊣",
  [0xd7] = "⊢",
  [0xd8] = "⋄",
  [0xd9] = "┘",
  [0xda] = "┌",
  /* 0xe_ */
  [0xe0] = "⍺",
  [0xe2] = "⊂",
  [0xe3] = "⊃",
  [0xe4] = "⍝",
  [0xe5] = "⍲",
  [0xe6] = "⍴",
  [0xe7] = "⍱",
  [0xe8] = "⌽",
  [0xe9] = "⊖",
  [0xea] = "○",
  [0xeb] = "∨",
  [0xec] = "⍳",
  [0xed] = "⍬",
  [0xee] = "∈",
  [0xef] = "∩",
  /* 0xf_ */
  [0xf0] = "⌿",
  [0xf1] = "⍀",
  [0xf2] = "≥",
  [0xf3] = "≤",
  [0xf4] = "≠",
  [0xf5] = "×",
  [0xf6] = "÷",
  [0xf7] = "⍙",
  [0xf8] = "∘",
  [0xf9] = "⍵",
  [0xfa] = "⍫",
  [0xfb] = "⍋",
  [0xfc] = "⍒",
  [0xfd] = "¯",
  [0xfe] = "¨"
};


/* Search ebdic2utf8_lut for an entry whose UTF-8 string is a prefix of
   `str`, trying the table indices in ascending order.

   On a match, the byte length of the matched UTF-8 string is stored in
   `*increment` and the table index is returned, converted to char.
   If nothing matches, 0 is returned and `*increment` is left untouched. */

char utf8lookup(const char *str, size_t *increment)
{
  size_t code = 0;

  while (code < 256)
    {
      const char *candidate = ebdic2utf8_lut[code];

      /* Indices without a table entry cannot match anything */
      if (candidate != NULL)
        {
          size_t candidate_len = strlen(candidate);

          if (!strncmp(str, candidate, candidate_len))
            {
              *increment = candidate_len;
              return (char)code;
            }
        }

      code++;
    }

  return 0;
}



/* Convert `src` by replacing every UTF-8 sequence that utf8lookup()
   recognizes with the single byte it returns; any other byte is copied
   unchanged.

   src:     NUL-terminated input string; it is not modified.
   returns: a newly allocated string that the caller must free().
            On allocation failure a message is printed with perror() and
            the program exits with status 1. */
char * convert_u2e(const char *src)
{
   /* Every loop iteration below emits exactly one byte and consumes at
      least one input byte (assuming every table string is non-empty), so
      the output can never be longer than the input.  Allocating that
      upper bound once avoids resizing the string for every byte. */
   char *dst = malloc(strlen(src) + 1);
   if (!dst)
      {
         perror("String allocation failed");
         exit(1);
      }

   size_t dst_length = 0;

   while(*src)
     {
        /* Convert next character */
        size_t src_increment;
        char ch = utf8lookup(src, &src_increment);

        /* If there is no conversion we take the first character in `src`
           as is */
        if (!ch)
          {
            ch = *src;
            src_increment = 1;
          }

        /* Append converted character to destination string */
        dst[dst_length++] = ch;
        src += src_increment;
     }

   /* Zero terminate */
   dst[dst_length] = '\0';

   /* Give back the bytes saved by multi-byte sequences.  If shrinking
      fails the original, larger block is still valid, so return that. */
   char *trimmed = realloc(dst, dst_length + 1);
   return trimmed ? trimmed : dst;
}

/* Small demonstration: convert a sample string and print every byte of
   the result in hexadecimal, followed by a newline. */
int main(void)
{
  char *str = convert_u2e("Hello⎕⍞⌹World");
  const char *cursor = str;

  while (*cursor)
    {
      printf("%hhx ", (unsigned char)*cursor);
      cursor++;
    }
  printf("\n");

  free(str);
}

C 中的连续字符串替换

问题描述

2 个解决方案

解决方案1
0 已采纳 2020-12-02 00:06:14

解决方案2
0 2020-12-03 19:47:12

C 中的连续字符串替换

问题描述

2 个解决方案

解决方案1 0 已采纳 2020-12-02 00:06:14

解决方案2 0 2020-12-03 19:47:12

解决方案1
0 已采纳 2020-12-02 00:06:14

解决方案2
0 2020-12-03 19:47:12