[英]How to convert characters to HTML entities using plain JavaScript
I have the following:我有以下几点:
var text = "Übergroße Äpfel mit Würmern";
I'm searching for a Javascript function to transform the text so that every special letter is represented by its HTML entity sequence like this:我正在寻找一个 Javascript 函数来转换文本,以便每个特殊字母都由其 HTML 实体序列表示,如下所示:
var newText = magicFunction(text);
...
newText = "&Uuml;bergro&szlig;e &Auml;pfel mit W&uuml;rmern";
The function should not only escape the letters of this example but also all of these.该函数不仅应转义此示例中的字母,还应转义所有这些字母。
How would you achieve that?你会如何做到这一点? Is there any existing function out there?
有没有现有的功能? (Plain, because a solution without a framework is preferred)
(简单,因为没有框架的解决方案是首选)
Btw: Yes, I've seen this question but it doesn't address my need.顺便说一句:是的,我见过这个问题,但它没有解决我的需要。
With the help of bucabay and the advice to create my own function i created this one which works for me.在 bucabay 的帮助和创建我自己的函数的建议下,我创建了这个对我有用的函数。 What do you guys think, is there a better solution somewhere?
大家怎么看,有没有更好的解决方案?
if(typeof escapeHtmlEntities == 'undefined') {
escapeHtmlEntities = function (text) {
return text.replace(/[\u00A0-\u2666<>\&]/g, function(c) {
return '&' +
(escapeHtmlEntities.entityTable[c.charCodeAt(0)] || '#'+c.charCodeAt(0)) + ';';
});
};
// all HTML4 entities as defined here: http://www.w3.org/TR/html4/sgml/entities.html
// added: amp, lt, gt, quot and apos
escapeHtmlEntities.entityTable = {
34 : 'quot',
38 : 'amp',
39 : 'apos',
60 : 'lt',
62 : 'gt',
160 : 'nbsp',
161 : 'iexcl',
162 : 'cent',
163 : 'pound',
164 : 'curren',
165 : 'yen',
166 : 'brvbar',
167 : 'sect',
168 : 'uml',
169 : 'copy',
170 : 'ordf',
171 : 'laquo',
172 : 'not',
173 : 'shy',
174 : 'reg',
175 : 'macr',
176 : 'deg',
177 : 'plusmn',
178 : 'sup2',
179 : 'sup3',
180 : 'acute',
181 : 'micro',
182 : 'para',
183 : 'middot',
184 : 'cedil',
185 : 'sup1',
186 : 'ordm',
187 : 'raquo',
188 : 'frac14',
189 : 'frac12',
190 : 'frac34',
191 : 'iquest',
192 : 'Agrave',
193 : 'Aacute',
194 : 'Acirc',
195 : 'Atilde',
196 : 'Auml',
197 : 'Aring',
198 : 'AElig',
199 : 'Ccedil',
200 : 'Egrave',
201 : 'Eacute',
202 : 'Ecirc',
203 : 'Euml',
204 : 'Igrave',
205 : 'Iacute',
206 : 'Icirc',
207 : 'Iuml',
208 : 'ETH',
209 : 'Ntilde',
210 : 'Ograve',
211 : 'Oacute',
212 : 'Ocirc',
213 : 'Otilde',
214 : 'Ouml',
215 : 'times',
216 : 'Oslash',
217 : 'Ugrave',
218 : 'Uacute',
219 : 'Ucirc',
220 : 'Uuml',
221 : 'Yacute',
222 : 'THORN',
223 : 'szlig',
224 : 'agrave',
225 : 'aacute',
226 : 'acirc',
227 : 'atilde',
228 : 'auml',
229 : 'aring',
230 : 'aelig',
231 : 'ccedil',
232 : 'egrave',
233 : 'eacute',
234 : 'ecirc',
235 : 'euml',
236 : 'igrave',
237 : 'iacute',
238 : 'icirc',
239 : 'iuml',
240 : 'eth',
241 : 'ntilde',
242 : 'ograve',
243 : 'oacute',
244 : 'ocirc',
245 : 'otilde',
246 : 'ouml',
247 : 'divide',
248 : 'oslash',
249 : 'ugrave',
250 : 'uacute',
251 : 'ucirc',
252 : 'uuml',
253 : 'yacute',
254 : 'thorn',
255 : 'yuml',
402 : 'fnof',
913 : 'Alpha',
914 : 'Beta',
915 : 'Gamma',
916 : 'Delta',
917 : 'Epsilon',
918 : 'Zeta',
919 : 'Eta',
920 : 'Theta',
921 : 'Iota',
922 : 'Kappa',
923 : 'Lambda',
924 : 'Mu',
925 : 'Nu',
926 : 'Xi',
927 : 'Omicron',
928 : 'Pi',
929 : 'Rho',
931 : 'Sigma',
932 : 'Tau',
933 : 'Upsilon',
934 : 'Phi',
935 : 'Chi',
936 : 'Psi',
937 : 'Omega',
945 : 'alpha',
946 : 'beta',
947 : 'gamma',
948 : 'delta',
949 : 'epsilon',
950 : 'zeta',
951 : 'eta',
952 : 'theta',
953 : 'iota',
954 : 'kappa',
955 : 'lambda',
956 : 'mu',
957 : 'nu',
958 : 'xi',
959 : 'omicron',
960 : 'pi',
961 : 'rho',
962 : 'sigmaf',
963 : 'sigma',
964 : 'tau',
965 : 'upsilon',
966 : 'phi',
967 : 'chi',
968 : 'psi',
969 : 'omega',
977 : 'thetasym',
978 : 'upsih',
982 : 'piv',
8226 : 'bull',
8230 : 'hellip',
8242 : 'prime',
8243 : 'Prime',
8254 : 'oline',
8260 : 'frasl',
8472 : 'weierp',
8465 : 'image',
8476 : 'real',
8482 : 'trade',
8501 : 'alefsym',
8592 : 'larr',
8593 : 'uarr',
8594 : 'rarr',
8595 : 'darr',
8596 : 'harr',
8629 : 'crarr',
8656 : 'lArr',
8657 : 'uArr',
8658 : 'rArr',
8659 : 'dArr',
8660 : 'hArr',
8704 : 'forall',
8706 : 'part',
8707 : 'exist',
8709 : 'empty',
8711 : 'nabla',
8712 : 'isin',
8713 : 'notin',
8715 : 'ni',
8719 : 'prod',
8721 : 'sum',
8722 : 'minus',
8727 : 'lowast',
8730 : 'radic',
8733 : 'prop',
8734 : 'infin',
8736 : 'ang',
8743 : 'and',
8744 : 'or',
8745 : 'cap',
8746 : 'cup',
8747 : 'int',
8756 : 'there4',
8764 : 'sim',
8773 : 'cong',
8776 : 'asymp',
8800 : 'ne',
8801 : 'equiv',
8804 : 'le',
8805 : 'ge',
8834 : 'sub',
8835 : 'sup',
8836 : 'nsub',
8838 : 'sube',
8839 : 'supe',
8853 : 'oplus',
8855 : 'otimes',
8869 : 'perp',
8901 : 'sdot',
8968 : 'lceil',
8969 : 'rceil',
8970 : 'lfloor',
8971 : 'rfloor',
9001 : 'lang',
9002 : 'rang',
9674 : 'loz',
9824 : 'spades',
9827 : 'clubs',
9829 : 'hearts',
9830 : 'diams',
338 : 'OElig',
339 : 'oelig',
352 : 'Scaron',
353 : 'scaron',
376 : 'Yuml',
710 : 'circ',
732 : 'tilde',
8194 : 'ensp',
8195 : 'emsp',
8201 : 'thinsp',
8204 : 'zwnj',
8205 : 'zwj',
8206 : 'lrm',
8207 : 'rlm',
8211 : 'ndash',
8212 : 'mdash',
8216 : 'lsquo',
8217 : 'rsquo',
8218 : 'sbquo',
8220 : 'ldquo',
8221 : 'rdquo',
8222 : 'bdquo',
8224 : 'dagger',
8225 : 'Dagger',
8240 : 'permil',
8249 : 'lsaquo',
8250 : 'rsaquo',
8364 : 'euro'
};
}
usage example:用法示例:
var text = "Übergroße Äpfel mit Würmern";
alert(escapeHtmlEntities (text));
result:结果:
&Uuml;bergro&szlig;e &Auml;pfel mit W&uuml;rmern
Update1: Thanks bucabay again for the ||更新 1:再次感谢bucabay的 || - hint
- 暗示
Update2: Updated entity table with amp,lt,gt,apos,quot, thanks richardtallent for the hint更新 2 :使用 amp、lt、gt、apos、quot 更新实体表,感谢richardtallent的提示
Update3(in 2014): Mathias Bynens created a lib called 'he' , maybe it serves your need. Update3(2014 年): Mathias Bynens创建了一个名为 'he' 的库,也许它可以满足您的需求。
All the other solutions suggested here, as well as most other JavaScript libraries that do HTML entity encoding/decoding, make several mistakes:这里建议的所有其他解决方案,以及大多数其他执行 HTML 实体编码/解码的 JavaScript 库,都会犯一些错误:
htmlDecode('&prcue;') should return '≼' (i.e. '\u227C').
htmlDecode('&prcue;') 应该返回 '≼'(即 '\u227C')。
htmlEncode('𝌆') should return something like &#x1D306; or &#119558;. If it returns &#xD834;&#xDF06; (or &#55348;&#57094;), it is broken.
htmlEncode('𝌆') 应该返回类似 &#x1D306; 或 &#119558; 的结果。如果它返回 &#xD834;&#xDF06;(或 &#55348;&#57094;),它就会被破坏。
htmlDecode('&#x1D306;') should return '𝌆' and not '팆' (i.e. '\uD306').
htmlDecode('&#x1D306;') 应该返回 '𝌆' 而不是 '팆'(即 '\uD306')。
htmlDecode('&#x80;') should return '€' (i.e. '\u20AC').
htmlDecode('&#x80;') 应该返回 '€'(即 '\u20AC')。
htmlDecode('&amp;amp;') should return '&amp;', not &.
htmlDecode('&amp;amp;') 应该返回 '&amp;',而不是 &。
For a robust solution that avoids all these issues, use a library I wrote called he for this.对于避免所有这些问题的强大解决方案,请使用我为此编写的名为he的库。 From its README:
从它的自述文件:
he (for “HTML entities”) is a robust HTML entity encoder/decoder written in JavaScript.
he (代表“HTML 实体”)是一个用 JavaScript 编写的强大的 HTML 实体编码器/解码器。 It supports all standardized named character references as per HTML , handles ambiguous ampersands and other edge cases just like a browser would , has an extensive test suite, and — contrary to many other JavaScript solutions — he handles astral Unicode symbols just fine.
它支持所有标准化的 HTML 命名字符引用, 像浏览器一样处理模糊的&符号和其他边缘情况,具有广泛的测试套件,并且——与许多其他 JavaScript 解决方案相反——他可以很好地处理星体 Unicode 符号。 An online demo is available.
提供在线演示。
Using escape() should work with the character code range 0x00 to 0xFF ( UTF-8 range ).使用 escape() 应该适用于字符代码范围 0x00 到 0xFF( UTF-8 范围)。
If you go beyond 0xFF (255), such as 0x100 (256) then escape() will not work:如果超出 0xFF (255),例如 0x100 (256),则 escape() 将不起作用:
escape("\u0100"); // %u0100
and:和:
text = "\u0100"; // Ā
html = escape(text).replace(/%(..)/g,"&#x$1;"); // &#xu0;100
So, if you want to cover all Unicode characters as defined on http://www.w3.org/TR/html4/sgml/entities.html , then you could use something like:因此,如果您想涵盖http://www.w3.org/TR/html4/sgml/entities.html 上定义的所有 Unicode 字符,那么您可以使用以下内容:
var html = text.replace(/[\u00A0-\u00FF]/g, function(c) {
return '&#'+c.charCodeAt(0)+';';
});
Note here the range is between: \u00A0-\u00FF.注意这里的范围是:\u00A0-\u00FF。
Thats the first character code range defined in http://www.w3.org/TR/html4/sgml/entities.html which is the same as what escape() covers.这是http://www.w3.org/TR/html4/sgml/entities.html中定义的第一个字符代码范围,与 escape() 涵盖的内容相同。
You'll need to add the other ranges you want to cover as well, or all of them.您还需要添加您想要涵盖的其他范围,或所有范围。
Example : UTF-8 range with general punctuations (\u00A0-\u00FF and \u2022-\u2135)示例:带有一般标点符号的 UTF-8 范围(\u00A0-\u00FF 和 \u2022-\u2135)
var html = text.replace(/[\u00A0-\u00FF\u2022-\u2135]/g, function(c) {
return '&#'+c.charCodeAt(0)+';';
});
Edit:编辑:
BTW: \u00A0-\u2666 should convert every Unicode character code not within ASCII range to HTML entities blindly:顺便说一句:\u00A0-\u2666 应该盲目地将不在 ASCII 范围内的每个 Unicode 字符代码转换为 HTML 实体:
var html = text.replace(/[\u00A0-\u2666]/g, function(c) {
return '&#'+c.charCodeAt(0)+';';
});
The he library is the only 100% reliable solution that I know of! he库是我所知道的唯一 100% 可靠的解决方案!
He is written by Mathias Bynens - one of the world's most renowned JavaScript gurus - and has the following features :他由世界最著名的 JavaScript 大师之一Mathias Bynens编写,具有以下特点:
he.encode('foo © bar ≠ baz 𝌆 qux');
// Output : 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'
he.decode('foo &copy; bar &ne; baz &#x1D306; qux');
// Output : 'foo © bar ≠ baz 𝌆 qux'
You can use:您可以使用:
function encodeHTML(str){
var aStr = str.split(''),
i = aStr.length,
aRet = [];
while (i--) {
var iC = aStr[i].charCodeAt();
if (iC < 65 || iC > 127 || (iC>90 && iC<97)) {
aRet.push('&#'+iC+';');
} else {
aRet.push(aStr[i]);
}
}
return aRet.reverse().join('');
}
This function HTMLEncodes everything that is not az/AZ.此函数 HTMLEncodes 不是 z/AZ 的所有内容。
[ Edit ] A rather old answer. [编辑]一个相当古老的答案。 Let's add a simpler String extension to encode all extended characters:
让我们添加一个更简单的 String 扩展来对所有扩展字符进行编码:
String.prototype.encodeHTML = function () {
return this.replace(/[\u0080-\u024F]/g,
function (v) {return '&#'+v.charCodeAt()+';';}
);
}
// usage
log('Übergroße Äpfel mit Würmern'.encodeHTML());
//=> '&#220;bergro&#223;e &#196;pfel mit W&#252;rmern'
Having a lookup table with a bazillion replace() calls is slow and not maintainable.拥有一个包含大量 replace() 调用的查找表很慢且不可维护。
Fortunately, the build-in escape() function also encodes most of the same characters, and puts them in a consistent format (%XX, where XX is the hex value of the character).幸运的是,内置的escape()函数也对大部分相同的字符进行编码,并将它们置于一致的格式中(%XX,其中XX 是字符的十六进制值)。
So, you can let escape() method do most of the work for you and just change its answer to be HTML entities instead of URL-escaped characters:因此,您可以让 escape() 方法为您完成大部分工作,只需将其答案更改为 HTML 实体而不是 URL 转义字符:
htmlescaped = escape(mystring).replace(/%(..)/g,"&#x$1;");
This uses the hex format for escaping values rather than the named entities, but for storing and displaying the values, it works just as well as named entities.这使用十六进制格式来转义值而不是命名实体,但对于存储和显示值,它与命名实体一样有效。
Of course, escape also escapes characters you don't need to escape in HTML (spaces, for instance), but you can unescape them with a few replace calls.当然,也逃脱逃脱你不需要在HTML逃脱(空格,例如)字符,但你可以用一些替代的呼叫反转义它们。
Edit: I like bucabay's answer better than my own... handles a larger range of characters, and requires no hacking afterward to get spaces, slashes, etc. unescaped.编辑:我比我自己的更喜欢 bucabay 的答案......处理更大范围的字符,并且之后不需要黑客来获得空格、斜线等。未转义。
Demo on JSFiddle在 JSFiddle 上演示
here's a tiny stand alone method that:这是一个很小的独立方法:
i don't know too much about unicode, but it seems to be working well.我不太了解 unicode,但它似乎运行良好。
// escape a string for display in html
// see also:
// polyfill for String.prototype.codePointAt
// https://raw.githubusercontent.com/mathiasbynens/String.prototype.codePointAt/master/codepointat.js
// how to convert characters to html entities
// http://stackoverflow.com/a/1354491/347508
// html overrides from
// https://html.spec.whatwg.org/multipage/syntax.html#table-charref-overrides / http://stackoverflow.com/questions/1354064/how-to-convert-characters-to-html-entities-using-plain-javascript/23831239#comment36668052_1354098
var _escape_overrides = { 0x00:'\uFFFD',0x80:'\u20AC',0x82:'\u201A',0x83:'\u0192',0x84:'\u201E',0x85:'\u2026',0x86:'\u2020',0x87:'\u2021',0x88:'\u02C6',0x89:'\u2030',0x8A:'\u0160',0x8B:'\u2039',0x8C:'\u0152',0x8E:'\u017D',0x91:'\u2018',0x92:'\u2019',0x93:'\u201C',0x94:'\u201D',0x95:'\u2022',0x96:'\u2013',0x97:'\u2014',0x98:'\u02DC',0x99:'\u2122',0x9A:'\u0161',0x9B:'\u203A',0x9C:'\u0153',0x9E:'\u017E',0x9F:'\u0178' };
function escapeHtml(str){
return str.replace(/([\u0000-\uD799]|[\uD800-\uDBFF][\uDC00-\uFFFF])/g, function(c) {
var c1 = c.charCodeAt(0);
// ascii character, use override or escape
if( c1 <= 0xFF ) return (c1=_escape_overrides[c1])?c1:escape(c).replace(/%(..)/g,"&#x$1;");
// utf8/16 character
else if( c.length == 1 ) return "&#" + c1 + ";";
// surrogate pair
else if( c.length == 2 && c1 >= 0xD800 && c1 <= 0xDBFF ) return "&#" + ((c1-0xD800)*0x400 + c.charCodeAt(1) - 0xDC00 + 0x10000) + ";"
// no clue ..
else return "";
});
}
I fixed my problem by using encodeURIComponent()
instead of escape()
.我通过使用
encodeURIComponent()
而不是escape()
解决了我的问题。
This might be the fix for you if the problem happens when sending your string in a URL.如果在 URL 中发送字符串时出现问题,这可能是您的解决方法。
Try this with the phrase ("hi & % '")试试这个短语 ("hi & % '")
escape()
returns escape()
返回
"hi%20%26%20%25%20%u2018"
Notice the %u2018
isn't very url friendly and can break the rest of the query string.请注意
%u2018
对 url 不是很友好,可能会破坏查询字符串的其余部分。
encodeURI()
returns encodeURI()
返回
"hi%20&%20%25%20%E2%80%98"
Notice the ampersand is still there.注意&符号仍然存在。
encodeURIComponent()
returns encodeURIComponent()
返回
"hi%20%26%20%25%20%E2%80%98"
Finally, all of our characters are properly encoded.最后,我们所有的字符都被正确编码。
I recommend to use the JS library entities .我建议使用 JS 库实体。 Using the library is quite simple.
使用该库非常简单。 See the examples from docs:
请参阅文档中的示例:
const entities = require("entities");
//encoding
entities.escape("&"); // "&#38;"
entities.encodeXML("&"); // "&#38;"
entities.encodeHTML("&"); // "&#38;"
//decoding
entities.decodeXML("asdf & ÿ ü '"); // "asdf & ÿ ü '"
entities.decodeHTML("asdf & ÿ ü '"); // "asdf & ÿ ü '"
Best solution is posted at phpjs.org implementation of PHP function htmlentities最佳解决方案发布在 phpjs.org implementation of PHP function htmlentities
The format is htmlentities(string, quote_style, charset, double_encode)
Full documentation on the PHP function which is identical can be read here格式是
htmlentities(string, quote_style, charset, double_encode)
关于 PHP 函数的完整文档可以在这里阅读
I adapted one of the answers from the referenced question, but added the ability to define an explicit mapping for character names.我改编了参考问题的答案之一,但添加了为字符名称定义显式映射的功能。
var char_names = {
160:'nbsp',
161:'iexcl',
220:'Uuml',
223:'szlig',
196:'Auml',
252:'uuml',
};
function HTMLEncode(str){
var aStr = str.split(''),
i = aStr.length,
aRet = [];
while (--i >= 0) {
var iC = aStr[i].charCodeAt();
if (iC < 32 || (iC > 32 && iC < 65) || iC > 127 || (iC>90 && iC<97)) {
if(char_names[iC]!=undefined) {
aRet.push('&'+char_names[iC]+';');
}
else {
aRet.push('&#'+iC+';');
}
} else {
aRet.push(aStr[i]);
}
}
return aRet.reverse().join('');
}
var text = "Übergroße Äpfel mit Würmer";
alert(HTMLEncode(text));
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.