简体   繁体   English

R中的Unicode字符转换

[英]unicode characters conversion in R

I have this MTST column, which when printed yields: 我有这个MTST列,打印时得到如下结果:

 [1] "<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F>                                 "
 [2] "<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S                                "
 [3] "<U+0391><U+0399>G<U+0399><U+039D><U+0391>                                  "
 [4] "<U+0391><U+0399>G<U+0399><U+039F>                                   "
 [5] "<U+0391><U+0399><U+0394><U+0397><U+03A8><U+039F>S                                 "
 [6] "<U+0391><U+039A><U+03A4><U+0399><U+039F>(<U+03A0><U+03A1><U+0395><U+0392><U+0395><U+0396><U+0391>)                          "
 [7] "<U+0391><U+039B><U+0395><U+039E><U+0391><U+039D><U+0394><U+03A1><U+039F><U+03A5><U+03A0><U+039F><U+039B><U+0397>                          "
 [8] "<U+0391><U+039B><U+0399><U+0391><U+03A1><U+03A4><U+039F>S                                "

I tried using the Unicode package and ran MTST <- as.u_char(MTST), which gives: 我尝试使用Unicode包并执行MTST <- as.u_char(MTST),结果是:

[1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>

I've also tried dump and dput but nothing changed. 我也尝试过dump和dput,但是什么都没有改变。

Note that MTST is initially of type character . 请注意, MTST最初是character类型。

Appreciate your help. 感谢您的帮助。 Thanks 谢谢

edit: below dput(MTST) is shown 编辑:dput(MTST)下面显示

c("<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F>                                 ",
"<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S                                ",
"<U+0391><U+0399>G<U+0399><U+039D><U+0391>                                  ",
"<U+0391><U+0399>G<U+0399><U+039F>                                   ",
"<U+0391><U+0399><U+0394><U+0397><U+03A8><U+039F>S                                 ",
"<U+0391><U+039A><U+03A4><U+0399><U+039F>(<U+03A0><U+03A1><U+0395><U+0392><U+0395><U+0396><U+0391>)                          ",
"<U+0391><U+039B><U+0395><U+039E><U+0391><U+039D><U+0394><U+03A1><U+039F><U+03A5><U+03A0><U+039F><U+039B><U+0397>                          ",
"<U+0391><U+039B><U+0399><U+0391><U+03A1><U+03A4><U+039F>S                                ",
"<U+0391><U+039D><U+0391><U+0392><U+03A1><U+03A5><U+03A4><U+0391>                                ",
"<U+0391><U+039D><U+0394><U+03A1><U+0391><U+0392><U+0399><U+0394><U+0391>                               ",
"<U+0391><U+039D>OG<U+0395><U+0399><U+0391>                                 ",
"<U+0391><U+03A1><U+0391><U+039E><U+039F>S                                  ",
"<U+0391><U+03A1><U+0391><U+03A7>O<U+0392><U+0391>                                 ",
"<U+0391><U+03A1>G<U+039F>S(<U+03A0><U+03A5><U+03A1>G<U+0395><U+039B><U+0391>)                          ",
"<U+0391><U+03A1>G<U+039F>S<U+03A4><U+039F><U+039B><U+0399>                               ",
"<U+0391><U+03A1><U+03A4><U+0391> (<U+03A0><U+039F><U+039B><U+0397>)                             ",
"<U+0391><U+03A1><U+03A4><U+0391> (F<U+0399><U+039B><U+039F>T<U+0395><U+0397>)                          ",
"<U+0391>S<U+03A4><U+0395><U+03A1><U+039F>S<U+039A><U+039F><U+03A0><U+0395><U+0399><U+039F>                           ",
"<U+0391>S<U+03A4><U+03A1><U+039F>S                                  ",
"<U+0391>S<U+03A4><U+03A5><U+03A0><U+0391><U+039B><U+0391><U+0399><U+0391>                              ",
"<U+0392><U+0391><U+039C><U+039F>S                                   ",
"<U+0392><U+0395><U+039B><U+039F> (<U+039A><U+039F><U+03A1><U+0399><U+039D>T<U+0399><U+0391>S)                        ",
"<U+0392><U+039F><U+039B><U+039F>S                                   ",
"<U+0392><U+03A5><U+03A4><U+0399><U+039D><U+0391>                                  ",
"G<U+039F><U+03A1><U+03A4><U+03A5>S                                  ",
"G<U+03A5>T<U+0395><U+0399><U+039F>                                  ",
"<U+0394><U+0395>SF<U+0399><U+039D><U+0391>                                 ",
"<U+0394><U+0399><U+0391><U+0392><U+039F><U+039B><U+0399><U+03A4>S<U+0399>                              ",
"<U+0394><U+039F><U+039C><U+039F><U+039A><U+039F>S                                 ",
"<U+0394><U+03A1><U+0391><U+039C><U+0391>                                   ",
"<U+0395><U+0394><U+0395>SS<U+0391>                                  ",
"<U+0395><U+039B><U+0395><U+03A5>S<U+0399><U+039D><U+0391>                                ",
"<U+0395><U+039B><U+039B><U+0397><U+039D><U+0399><U+039A><U+039F> ae<U+03C1>                            ",
"<U+0396><U+0391><U+039A><U+03A5><U+039D>T<U+039F>S                                ",
"<U+0396><U+0391><U+039A><U+03A5><U+039D>T<U+039F>S_<U+03A0><U+039F><U+039B><U+0397>                           ",
"<U+0396><U+0391><U+03A1><U+039F>S                                   ",
"<U+0397><U+03A1><U+0391><U+039A><U+039B><U+0395><U+0399><U+039F>                                ",
"T<U+0391>S<U+039F>S                                   ", "T<U+0397><U+03A1><U+0391> (S<U+0391><U+039D><U+03A4><U+039F><U+03A1><U+0399><U+039D><U+0397>",
"<U+0399><U+0395><U+03A1><U+0391><U+03A0><U+0395><U+03A4><U+03A1><U+0391>                               ",
"<U+0399><U+039A><U+0391><U+03A1><U+0399><U+0391>_<U+0391>/<U+0394>                              ",
"<U+0399>O<U+0391><U+039D><U+039D><U+0399><U+039D><U+0391>                                ",
"<U+039A><U+0391><U+0392><U+0391><U+039B><U+0391> (<U+03A0><U+039F><U+039B><U+0397>)                           ",
"<U+039A><U+0391><U+0392><U+0391><U+039B><U+0391>(<U+0391><U+039C><U+03A5>G<U+0394><U+0391><U+039B><U+0395>O<U+039D><U+0391>S)                    ",
"<U+039A><U+0391><U+039B><U+0391><U+0392><U+03A1><U+03A5><U+03A4><U+0391>                               ",
"<U+039A><U+0391><U+039B><U+0391><U+039C><U+0391><U+03A4><U+0391>                                ",
"<U+039A><U+0391><U+039B><U+0391><U+039C><U+03A0><U+0391><U+039A><U+0391>                               ",
"<U+039A><U+0391><U+03A1><U+0394><U+0399><U+03A4>S<U+0391>                                ",
"<U+039A><U+0391><U+03A1><U+03A0><U+0391>T<U+039F>S_<U+0391>/<U+0394>                            ",
"<U+039A><U+0391><U+03A1><U+03A0><U+0391>T<U+039F>S_<U+03A0><U+039F><U+039B><U+0397>                           ",
"<U+039A><U+0391><U+03A1><U+03A0><U+0395><U+039D><U+0397>S<U+0399>                               ",
"<U+039A><U+0391><U+03A1><U+03A5>S<U+03A4><U+039F>S                                ",
"<U+039A><U+0391>S<U+039F>S                                   ",
"<U+039A><U+0391>S<U+03A4><U+0395><U+039B><U+039B><U+0399>                                ",
"<U+039A><U+0391>S<U+03A4><U+039F><U+03A1><U+0399><U+0391>                                ",
"<U+039A><U+0395><U+03A1><U+039A><U+03A5><U+03A1><U+0391>                                 ",
"<U+039A><U+039F><U+0396><U+0391><U+039D><U+0397>                                  ",
"<U+039A><U+039F><U+039C><U+039F><U+03A4><U+0397><U+039D><U+0397>                                ",
"<U+039A><U+039F><U+039D><U+0399><U+03A4>S<U+0391>                                 ",
"<U+039A><U+039F><U+03A1><U+0399><U+039D>T<U+039F>S                                ",
"<U+039A><U+03A5>T<U+0397><U+03A1><U+0391>_<U+0391>/<U+0394>                              ",
"<U+039A><U+03A5><U+039C><U+0397>                                    ",
"<U+039A>OS                                     ", "<U+039A>OS_<U+03A0><U+039F><U+039B><U+0397>                                ",
"<U+039B><U+0391><U+039C><U+0399><U+0391>                                   ",
"<U+039B><U+0391><U+03A1><U+0399>S<U+0391>                                  ",
"<U+039B><U+0395><U+03A1><U+039F>S                                   ",
"<U+039B><U+0395><U+03A5><U+039A><U+0391><U+0394><U+0391> (<U+039D><U+0397>S<U+0399>)                          ",
"<U+039B><U+0395>O<U+039D><U+0399><U+0394><U+0399><U+039F>                                ",
"<U+039B><U+0397><U+039C><U+039D><U+039F>S                                  ",
"<U+039B><U+0399><U+0394>O<U+03A1><U+0399><U+039A><U+0399>                                ",
"<U+039C><U+0391><U+039A><U+0395><U+0394><U+039F><U+039D><U+0399><U+0391>                               ",
"<U+039C><U+0391><U+03A1><U+0391>TO<U+039D><U+0391>S                               ",
"<U+039C><U+0395>TO<U+039D><U+0397>                                  ",
"<U+039C><U+0395>S<U+039F><U+039B><U+039F>GG<U+0399>                               ",
"<U+039C><U+0397><U+039B><U+039F>S_<U+0391><U+039C>S                               ",
"<U+039C><U+03A5><U+039A><U+039F><U+039D><U+039F>S                                 ",
"<U+039C><U+03A5><U+03A4><U+0399><U+039B><U+0397><U+039D><U+0397>                                ",
"<U+039D><U+0391><U+039E><U+039F>S                                   ",
"<U+039D><U+0391><U+03A5><U+03A0><U+0391><U+039A><U+03A4><U+039F>S                               ",
"<U+039D><U+0391><U+03A5><U+03A0><U+039B><U+0399><U+039F>                                 ",
"<U+039D><U+0395><U+0391> F<U+0399><U+039B><U+0391><U+0394><U+0395><U+039B>F<U+0395><U+0399><U+0391>                         ",
"<U+039E><U+0391><U+039D>T<U+0397>                                   ",
"<U+039F><U+03A1><U+0395>S<U+03A4><U+0399><U+0391><U+0394><U+0391>                               ",
"<U+03A0><U+0391><U+0399><U+0391><U+039D><U+0399><U+0391>                                 ",
"<U+03A0><U+0391><U+039B><U+0391><U+0399><U+039F><U+03A7>O<U+03A1><U+0391>                              ",
"<U+03A0><U+0391><U+03A1><U+039F>S_<U+0391>/<U+0394>                               ",
"<U+03A0><U+0391><U+03A4><U+03A1><U+0391>                                   ",
"<U+03A0><U+0395><U+0399><U+03A1><U+0391><U+0399><U+0391>S                                ",
"<U+03A0><U+039F><U+039B><U+03A5>G<U+03A5><U+03A1><U+039F>S                               ",
"<U+03A0><U+039F><U+03A4><U+0399><U+0394><U+0391><U+0399><U+0391>                                ",
"<U+03A0><U+03A4><U+039F><U+039B><U+0395><U+039C><U+0391><U+0399><U+0394><U+0391>                              ",
"<U+03A0><U+03A5><U+03A1>G<U+039F>S                                  ",
"<U+03A1><U+0391>F<U+0397><U+039D><U+0391>                                  ",
"<U+03A1><U+0395>T<U+03A5><U+039C><U+039D><U+039F>                                 ",
"<U+03A1><U+039F><U+0394><U+039F>S                                   ",
"S<U+0391><U+039C><U+039F>S                                   ",
"S<U+0395><U+0394><U+0395>S                                   ",
"S<U+0395><U+03A1><U+03A1><U+0395>S                                  ",
"S<U+0397><U+03A4><U+0395><U+0399><U+0391>                                  ",
"S<U+039A><U+0399><U+0391>T<U+039F>S                                 ",
"S<U+039A><U+039F><U+03A4><U+0399><U+039D><U+0391>                                 ",
"S<U+039A><U+03A5><U+03A1><U+039F>S                                  ",
"S<U+039F><U+03A5><U+0394><U+0391>                                   ",
"S<U+039F><U+03A5>F<U+039B><U+0399>                                  ",
"S<U+03A0><U+0391><U+03A1><U+03A4><U+0397>                                  ",
"S<U+03A0><U+0391><U+03A4><U+0391>(<U+0392><U+0395><U+039D><U+0399><U+0396><U+0395><U+039B><U+039F>S)                        ",
"S<U+03A0><U+0395><U+03A4>S<U+0395>S                                 ",
"S<U+03A4><U+0395>F<U+0391><U+039D><U+0399> (<U+039A><U+039F><U+03A1><U+0399><U+039D>T<U+0399><U+0391>S)                     ",
"S<U+03A5><U+039A><U+03A5>O<U+039D><U+0391>                                 ",
"S<U+03A5><U+03A1><U+039F>S_<U+0391>/<U+0394>                               ",
"<U+03A4><U+0391><U+039D><U+0391>G<U+03A1><U+0391>                                 ",
"<U+03A4><U+0391><U+03A4><U+039F><U+0399> (<U+0394><U+0395><U+039A><U+0395><U+039B><U+0395><U+0399><U+0391>)                        ",
"<U+03A4><U+0396><U+0395><U+03A1><U+039C><U+0399><U+0391><U+0394><U+0395>S                              ",
"<U+03A4><U+03A1><U+0399><U+039A><U+0391><U+039B><U+0391> <U+0397><U+039C><U+0391>T<U+0395><U+0399><U+0391>S                        ",
"<U+03A4><U+03A1><U+0399><U+039A><U+0391><U+039B><U+0391> T<U+0395>SS<U+0391><U+039B><U+0399><U+0391>S                       ",
"<U+03A4><U+03A1><U+0399><U+03A0><U+039F><U+039B><U+0397>                                 ",
"<U+03A4><U+03A5><U+039C><U+03A0><U+0391><U+039A><U+0399>                                 ",
"<U+03A4><U+03A5><U+03A1><U+0399><U+039D>T<U+0391>                                 ",
"F<U+0391><U+03A1>S<U+0391><U+039B><U+0391>                                 ",
"F<U+039B>O<U+03A1><U+0399><U+039D><U+0391>                                 ",
"F<U+039F><U+03A5><U+03A1><U+039D><U+0397>                                  ",
"F<U+03A5><U+03A7><U+03A4><U+0399><U+0391>                                  ",
"<U+03A7><U+0391><U+039B><U+039A><U+0399><U+0394><U+0391>                                 ",
"<U+03A7><U+0391><U+039D><U+0399><U+0391>                                   ",
"<U+03A7><U+0399><U+039F>S                                    ",
"<U+03A7><U+03A1><U+03A5>S<U+039F><U+03A5><U+03A0><U+039F><U+039B><U+0397>_<U+039A><U+0391><U+0392><U+0391><U+039B><U+0391>                       ",
"O<U+03A1><U+0395><U+039F><U+0399>                                   "
)

What you have there looks like plain 7-bit ASCII characters with some attempt at encoding Unicode code-points by wrapping some of them thus: <U+abcd> . 您所拥有的内容看起来像是普通的7位ASCII字符,并尝试通过包裹其中的一些代码来编码Unicode代码点: <U+abcd>

This is not a recognised encoding for Unicode, as far as I can tell, partly because how would you put a real < in your text? 据我所知,这不是公认的Unicode编码,部分原因是:这样一来,您要如何在文本中放入真正的<呢? I suppose every < could be <U+jklm> where jklm is the code for an angle bracket... But ick. 我想每个<都可以写成<U+jklm>,其中jklm是尖括号的代码……但这太难看了。

So, first, try and get a UTF-8 encoded string from whatever generated this ascii-encoded mess! 因此,首先,尝试从生成该ascii编码混乱的任何东西中获取UTF-8编码的字符串!

However... after some serious hair pulling... 但是...严重拉扯头发后...

stringi to the rescue! stringi来救援! Where 'MTST' is your vector of stuff, first convert the angle bracket notation to backslash-u and then use stri_unescape_unicode : 如果“ MTST”是您的素材矢量,请首先将尖括号符号转换为反斜杠-u,然后使用stri_unescape_unicode

> library(stringi)
> # Rewrite each <U+XXXX> escape as \uXXXX in a single pass. Matching the whole
> # escape (rather than deleting every ">") leaves genuine ">" characters in the
> # data untouched.
> greek2 <- gsub("<U\\+([0-9A-Fa-f]{4})>", "\\\\u\\1", MTST)
> stri_unescape_unicode(greek2)
[1] "ΑGΡΙΝΙΟ                                 "
[2] "ΑGΧΙΑΛΟS                                "
[3] "ΑΙGΙΝΑ                                  "
[4] "ΑΙGΙΟ                                   "
[5] "ΑΙΔΗΨΟS                                 "
[6] "ΑΚΤΙΟ(ΠΡΕΒΕΖΑ)                          "

all the way up to 一直到

[123] "FΥΧΤΙΑ                                  "
[124] "ΧΑΛΚΙΔΑ                                 "
[125] "ΧΑΝΙΑ                                   "
[126] "ΧΙΟS                                    "
[127] "ΧΡΥSΟΥΠΟΛΗ_ΚΑΒΑΛΑ                       "
[128] "OΡΕΟΙ                                   "

once I fixed the bizarrely missing comma and quote mark in your "dput" data (edited your question for you). 一旦我修复了您的“dput”数据中莫名缺失的逗号和引号(已替您编辑了问题)。

I've written a convenient, general, and internally slightly wonky function in base R that works well for this purpose. 我已经在base R中编写了一个方便,通用且内部有点不稳定的函数,可以很好地实现此目的。 Here it is: 这里是:

# dsub: a strsplit() variant that passes both the fields and the delimiters of
# each input string to a callback.
#
# Arguments:
#   input  character vector to process.
#   re     Perl-compatible regular expression matching one delimiter.
#   f      function(s, d), called once per element of `input`. `s` holds the
#          fields and `d` the delimiter texts that separated them; `s` is always
#          exactly one element longer than `d`. The default re-joins the fields
#          with each delimiter wrapped in '<' and '>'.
#
# Returns a list with one entry per element of `input`, each holding the value
# returned by `f`. A zero-length `input` yields an empty list.
dsub <- function(input, re, f = function(s, d) paste0(s, c(if (length(d) == 0L) NULL else paste0('<', d, '>'), ''), collapse = '')) {
    # Fields between delimiters.
    splits <- strsplit(input, re, perl = TRUE)

    # Delimiters: put a dummy '.' in front of every delimiter and split on that
    # dummy followed by a lookahead for the delimiter. Splitting on a purely
    # zero-width pattern makes strsplit() misbehave, hence the dummy character.
    marked <- gsub(paste0('(', re, ')'), '.\\1', input, perl = TRUE)
    pieces <- strsplit(marked, paste0('\\.(?=', re, ')'), perl = TRUE)
    # Every piece after the first starts with a delimiter; keep only that
    # delimiter. [\s\S]* is used instead of .* so that text following the
    # delimiter is stripped even when it contains newlines.
    delims <- lapply(pieces, function(x) {
        sub(paste0('^(', re, ')[\\s\\S]*'), '\\1', x[-1L], perl = TRUE)
    })

    # seq_along() keeps this a no-op for zero-length input.
    lapply(seq_along(splits), function(i) {
        s <- splits[[i]]
        d <- delims[[i]]
        # strsplit() drops the empty field after a trailing delimiter; restore
        # it so that length(s) == length(d) + 1 always holds.
        if (length(s) == length(d)) s <- c(s, '')
        f(s, d)
    })
}

The idea behind the function is to provide a more powerful variation of strsplit() which allows you to not just split the input strings into fields, but have a lambda called once for each input string, which takes both the field list (I called it splits or s in my code) and the delimiters that delimited each field (called it delims or d ). 该函数背后的想法是提供一种更强大的strsplit()变体:它不仅可以将输入字符串分割成多个字段,还会为每个输入字符串调用一次lambda,该lambda同时接收字段列表(在我的代码中称为splits或s)以及分隔各字段的定界符(称为delims或d)。

Importantly, the final field is never delimited, so s is always one element longer than d . 重要的是,最终字段永远不会被定界,因此s总是比d长一个元素。 Now, it should be noted that that's not how strsplit() normally behaves; 现在,应该注意的是,strsplit()通常并不是这样工作的; it actually doesn't return a final empty string field if the final delimiter in the input string comprised the end of said string, but I've sort of "patched" that behavior in my dsub() function for the sake of consistency; 如果输入字符串中的最后一个定界符位于该字符串的末尾,它实际上不会返回最后的空字符串字段,但是为了保持一致性,我在dsub()函数中对该行为进行了“修补”; for every call to the lambda f() , it is guaranteed that s will be one element longer than d . 对于每次对lambda f()的调用,都可以保证s比d长一个元素。

A second quirk is related to the way I extracted the delimiters; 第二个怪癖与我提取定界符的方式有关。 that was easier said than done. 说起来容易做起来难。 I used strsplit() again, but made the regex a zero-width lookahead assertion to preserve the delimiter content, and then, after the split, I called sub() to strip off everything after the delimiter. 我再次使用strsplit() ,但使正则表达式成为零宽度的超前断言,以保留分隔符的内容,然后在拆分后,我调用sub()剥离分隔符后的所有内容。 Now, strsplit() behaves weirdly when you use an entirely zero-width regex that matches multiple characters; 现在,当您使用一个完全零宽度的正则表达式来匹配多个字符时, strsplit()行为很奇怪。 I think what happens is that it matches the regex twice in the same spot, and then splits up the first and subsequent characters across adjacent returned fields. 我认为发生的事情是它在同一位置两次匹配了正则表达式,然后在相邻的返回字段之间拆分了第一个和后续字符。 To solve that I added a dummy char before every instance of the delimiter and then matched that char (non-zero-width, just prior to the lookahead assertion) as part of the split regex, which naturally strips it off. 为了解决这个问题,我在分隔符的每个实例之前添加了一个哑字符,然后将该字符(非零宽度,恰好在超前断言之前)匹配为拆分正则表达式的一部分,自然可以将其剥离。

Anyway, here's a simple demo that shows how you can use dsub() with the lambda calling intToUtf8() to do this kind of "Unicode interpolation": 无论如何,这是一个简单的演示,展示了如何使用dsub()和lambda调用intToUtf8()来进行这种“ Unicode内插”:

# Demo: decode <U+XXXX> escapes by giving dsub() a callback built on intToUtf8().
input <- c('Luc TR<U+00c9>HAN','aa<U+00ca>bb<U+00cb>cc','<U+00CC><U+00Cd>','','  ')
re <- '<U\\+([0-9a-fA-F]{4})>'
# s: fields of one input string; d: the <U+XXXX> delimiters found between them.
f <- function(s, d) {
    # Pull the four hex digits out of each delimiter, parse them explicitly as
    # base 16, and turn every code point into its own one-character string.
    decoded <- if (length(d) == 0L) NULL else intToUtf8(strtoi(sub(re, '\\1', d), 16L), multiple = TRUE)
    paste0(s, c(decoded, ''), collapse = '')
}
do.call(c, dsub(input, re, f))
## [1] "Luc TRÉHAN" "aaÊbbËcc"   "ÌÍ"         ""           "  "

And using the rather extensive example data given in this question: 并使用此问题中给出的相当广泛的示例数据:

## Reuses the `re` and `f` objects from the demo above; the input vector literal
## is abbreviated here and stands for the full vector given in the question.
input <- c("<U+0391>G<U+03A1><U+0399><..."); ## (excerpted)
do.call(c,dsub(input,re,f));
##   [1] "ΑGΡΙΝΙΟ                                 " "ΑGΧΙΑΛΟS                                " "ΑΙGΙΝΑ                                  " "ΑΙGΙΟ                                   "
##   [5] "ΑΙΔΗΨΟS                                 " "ΑΚΤΙΟ(ΠΡΕΒΕΖΑ)                          " "ΑΛΕΞΑΝΔΡΟΥΠΟΛΗ                          " "ΑΛΙΑΡΤΟS                                "
##   [9] "ΑΝΑΒΡΥΤΑ                                " "ΑΝΔΡΑΒΙΔΑ                               " "ΑΝOGΕΙΑ                                 " "ΑΡΑΞΟS                                  "
##  [13] "ΑΡΑΧOΒΑ                                 " "ΑΡGΟS(ΠΥΡGΕΛΑ)                          " "ΑΡGΟSΤΟΛΙ                               " "ΑΡΤΑ (ΠΟΛΗ)                             "
##  [17] "ΑΡΤΑ (FΙΛΟTΕΗ)                          " "ΑSΤΕΡΟSΚΟΠΕΙΟ                           " "ΑSΤΡΟS                                  " "ΑSΤΥΠΑΛΑΙΑ                              "
##  [21] "ΒΑΜΟS                                   " "ΒΕΛΟ (ΚΟΡΙΝTΙΑS)                        " "ΒΟΛΟS                                   " "ΒΥΤΙΝΑ                                  "
##  [25] "GΟΡΤΥS                                  " "GΥTΕΙΟ                                  " "ΔΕSFΙΝΑ                                 " "ΔΙΑΒΟΛΙΤSΙ                              "
##  [29] "ΔΟΜΟΚΟS                                 " "ΔΡΑΜΑ                                   " "ΕΔΕSSΑ                                  " "ΕΛΕΥSΙΝΑ                                "
##  [33] "ΕΛΛΗΝΙΚΟ aeρ                            " "ΖΑΚΥΝTΟS                                " "ΖΑΚΥΝTΟS_ΠΟΛΗ                           " "ΖΑΡΟS                                   "
##  [37] "ΗΡΑΚΛΕΙΟ                                " "TΑSΟS                                   " "TΗΡΑ (SΑΝΤΟΡΙΝΗ"                          "ΙΕΡΑΠΕΤΡΑ                               "
##  [41] "ΙΚΑΡΙΑ_Α/Δ                              " "ΙOΑΝΝΙΝΑ                                " "ΚΑΒΑΛΑ (ΠΟΛΗ)                           " "ΚΑΒΑΛΑ(ΑΜΥGΔΑΛΕOΝΑS)                    "
##  [45] "ΚΑΛΑΒΡΥΤΑ                               " "ΚΑΛΑΜΑΤΑ                                " "ΚΑΛΑΜΠΑΚΑ                               " "ΚΑΡΔΙΤSΑ                                "
##  [49] "ΚΑΡΠΑTΟS_Α/Δ                            " "ΚΑΡΠΑTΟS_ΠΟΛΗ                           " "ΚΑΡΠΕΝΗSΙ                               " "ΚΑΡΥSΤΟS                                "
##  [53] "ΚΑSΟS                                   " "ΚΑSΤΕΛΛΙ                                " "ΚΑSΤΟΡΙΑ                                " "ΚΕΡΚΥΡΑ                                 "
##  [57] "ΚΟΖΑΝΗ                                  " "ΚΟΜΟΤΗΝΗ                                " "ΚΟΝΙΤSΑ                                 " "ΚΟΡΙΝTΟS                                "
##  [61] "ΚΥTΗΡΑ_Α/Δ                              " "ΚΥΜΗ                                    " "ΚOS                                     " "ΚOS_ΠΟΛΗ                                "
##  [65] "ΛΑΜΙΑ                                   " "ΛΑΡΙSΑ                                  " "ΛΕΡΟS                                   " "ΛΕΥΚΑΔΑ (ΝΗSΙ)                          "
##  [69] "ΛΕOΝΙΔΙΟ                                " "ΛΗΜΝΟS                                  " "ΛΙΔOΡΙΚΙ                                " "ΜΑΚΕΔΟΝΙΑ                               "
##  [73] "ΜΑΡΑTOΝΑS                               " "ΜΕTOΝΗ                                  " "ΜΕSΟΛΟGGΙ                               " "ΜΗΛΟS_ΑΜS                               "
##  [77] "ΜΥΚΟΝΟS                                 " "ΜΥΤΙΛΗΝΗ                                " "ΝΑΞΟS                                   " "ΝΑΥΠΑΚΤΟS                               "
##  [81] "ΝΑΥΠΛΙΟ                                 " "ΝΕΑ FΙΛΑΔΕΛFΕΙΑ                         " "ΞΑΝTΗ                                   " "ΟΡΕSΤΙΑΔΑ                               "
##  [85] "ΠΑΙΑΝΙΑ                                 " "ΠΑΛΑΙΟΧOΡΑ                              " "ΠΑΡΟS_Α/Δ                               " "ΠΑΤΡΑ                                   "
##  [89] "ΠΕΙΡΑΙΑS                                " "ΠΟΛΥGΥΡΟS                               " "ΠΟΤΙΔΑΙΑ                                " "ΠΤΟΛΕΜΑΙΔΑ                              "
##  [93] "ΠΥΡGΟS                                  " "ΡΑFΗΝΑ                                  " "ΡΕTΥΜΝΟ                                 " "ΡΟΔΟS                                   "
##  [97] "SΑΜΟS                                   " "SΕΔΕS                                   " "SΕΡΡΕS                                  " "SΗΤΕΙΑ                                  "
## [101] "SΚΙΑTΟS                                 " "SΚΟΤΙΝΑ                                 " "SΚΥΡΟS                                  " "SΟΥΔΑ                                   "
## [105] "SΟΥFΛΙ                                  " "SΠΑΡΤΗ                                  " "SΠΑΤΑ(ΒΕΝΙΖΕΛΟS)                        " "SΠΕΤSΕS                                 "
## [109] "SΤΕFΑΝΙ (ΚΟΡΙΝTΙΑS)                     " "SΥΚΥOΝΑ                                 " "SΥΡΟS_Α/Δ                               " "ΤΑΝΑGΡΑ                                 "
## [113] "ΤΑΤΟΙ (ΔΕΚΕΛΕΙΑ)                        " "ΤΖΕΡΜΙΑΔΕS                              " "ΤΡΙΚΑΛΑ ΗΜΑTΕΙΑS                        " "ΤΡΙΚΑΛΑ TΕSSΑΛΙΑS                       "
## [117] "ΤΡΙΠΟΛΗ                                 " "ΤΥΜΠΑΚΙ                                 " "ΤΥΡΙΝTΑ                                 " "FΑΡSΑΛΑ                                 "
## [121] "FΛOΡΙΝΑ                                 " "FΟΥΡΝΗ                                  " "FΥΧΤΙΑ                                  " "ΧΑΛΚΙΔΑ                                 "
## [125] "ΧΑΝΙΑ                                   " "ΧΙΟS                                    " "ΧΡΥSΟΥΠΟΛΗ_ΚΑΒΑΛΑ                       " "OΡΕΟΙ                                   "

Here's another alternative for recovering the true Unicode characters encoded in a string (borrowed from this question ). 这是恢复字符串中所编码的真正Unicode字符的另一种方法(借用自此问题)。 Here we carefully match the form <U+[hex]> and unroll that hex value into a properly sized Unicode character with some bit manipulation. 在这里,我们仔细匹配<U+[hex]>形式,并通过一些位操作将十六进制值展开为大小合适的Unicode字符。

# trueunicode: replace every "<U+XXXX>" escape (exactly four hex digits) in a
# character vector with the character it denotes.
#
# Arguments:
#   x  character vector, possibly containing <U+XXXX> escapes.
#
# Returns `x` with the escapes substituted and its encoding declared as UTF-8.
# Elements without any escape are returned unchanged. Stops with an error for
# <U+0000>, because an R string cannot contain a NUL byte.
trueunicode <- function(x) {
    # Encode a single code point (integer scalar) as its UTF-8 byte sequence.
    packuni <- function(cp) {
        bv <- intToBits(cp)
        setbits <- which(bv != as.raw(0))
        if (length(setbits) == 0L) {
            stop("U+0000 cannot be stored in an R string")
        }
        # Position (1-based) of the highest set bit decides the byte count.
        maxbit <- max(setbits)
        if (maxbit < 8) {
            # Up to 7 bits: plain ASCII, one byte.
            rawToChar(as.raw(cp))
        } else if (maxbit < 12) {
            # Up to 11 bits: 110xxxxx 10xxxxxx (bits are listed low to high).
            rawToChar(rev(packBits(c(bv[1:6], as.raw(c(0,1)), bv[7:11], as.raw(c(0,1,1))), "raw")))
        } else if (maxbit < 17) {
            # Up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx.
            rawToChar(rev(packBits(c(bv[1:6], as.raw(c(0,1)), bv[7:12], as.raw(c(0,1)), bv[13:16], as.raw(c(0,1,1,1))), "raw")))
        } else {
            stop("too many bits")
        }
    }
    m <- gregexpr("<U\\+[0-9a-fA-F]{4}>", x)
    codes <- regmatches(x, m)
    chars <- lapply(codes, function(code) {
        # Characters 4-7 of "<U+XXXX>" are the hex digits. Parsing with an
        # explicit base yields integer(0) when an element has no escapes, and
        # vapply() then returns character(0), which regmatches<- accepts.
        codepoints <- strtoi(substring(code, 4, 7), 16L)
        vapply(codepoints, packuni, character(1), USE.NAMES = FALSE)
    })
    regmatches(x, m) <- chars
    # The substituted bytes are UTF-8; declare that so they print correctly.
    Encoding(x) <- "UTF-8"
    x
}

using the sample 使用样本

input <- c("<U+0391>G<U+03A1><U+0399><U+039D><U+0399><U+039F>", "<U+0391>G<U+03A7><U+0399><U+0391><U+039B><U+039F>S","<U+0391><U+0399>G<U+0399><U+039D><U+0391>", "<U+0391><U+0399>G<U+0399><U+039F>", "<U+0391><U+0399><U+0394><U+0397><U+03A8><U+039F>S","<U+0391><U+039A><U+03A4><U+0399><U+039F>(<U+03A0><U+03A1><U+0395><U+0392><U+0395><U+0396><U+0391>)")

you get 你得到

trueunicode(input)
# [1] "ΑGΡΙΝΙΟ"        "ΑGΧΙΑΛΟS"       "ΑΙGΙΝΑ"         "ΑΙGΙΟ"         
# [5] "ΑΙΔΗΨΟS"        "ΑΚΤΙΟ(ΠΡΕΒΕΖΑ)"

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM