[英]Extract text from HTML without losing new lines
I use the following code to extract text from html.我使用以下代码从 html 中提取文本。
/**
 * Returns the text of an HTML fragment.
 *
 * The markup is parsed by assigning it to the innerHTML of a detached
 * <span>, and the element's text is then read back. Tags are dropped, so
 * the line structure implied by block elements such as <div> is not kept.
 *
 * @param {string} s - HTML markup to read the text from.
 * @returns {string} The text of the parsed fragment.
 */
function extractContent(s) {
  const holder = document.createElement('span');
  holder.innerHTML = s;
  // innerText is only consulted when textContent is falsy (e.g. an empty string).
  return holder.textContent || holder.innerText;
}

var html = "first line. <div>second line. </div><div>third line.</div><div><br></div>";
var text = extractContent(html);
console.log("TEXT: " + text);
The result of this code is the text without new lines.这段代码的结果是没有换行的文本。 But I want the result to replace divs with "\\n" like this:但我希望结果用 "\\n" 替换 div,如下所示:
first line."\n"second line. "\n" third line."\n"
Use s.replaceAll('<div>', '\\\\n');
使用s.replaceAll('<div>', '\\\\n');
to replace the linefeed.替换换行符。
Note: The backslash \\
needs to be escaped with another \\
.注意:反斜杠\\
需要用另一个\\
转义。
var html = "first line. <div>second line. </div><div>third line.</div><div><br></div>";
var text = extractContent(html);
console.log("TEXT: " + text);

/**
 * Returns the text of an HTML fragment, with a textual marker inserted
 * wherever an opening <div> tag appeared.
 *
 * The marker is written into the markup before it is parsed, so it survives
 * as ordinary text when the tags are dropped.
 *
 * @param {string} s - HTML markup to read the text from.
 * @returns {string} The text of the parsed fragment, including the markers.
 */
function extractContent(s) {
  const span = document.createElement('span');
  // The string literal below is the characters backslash, backslash, "n" --
  // a visible marker, not a line-feed character.
  // NOTE(review): the search string was a single space in the extracted
  // snippet, which replaced every space; '<div>' matches the stated goal of
  // replacing divs -- confirm against the original answer.
  const marked = s.replaceAll('<div>', '\\\\n');
  span.innerHTML = marked;
  return span.textContent || span.innerText;
}
document.createElement
is not necessary; you can use a regexp
alone to achieve this: document.createElement
不是必需的,您可以单独使用regexp
来实现这一点:
var html = "first line. <div>second line. </div><div>third line.</div><div><br></div>";
var text = extractContent(html);
console.log("TEXT: " + text);

/**
 * Replaces the HTML tags in a string with a textual marker, using only a
 * regular expression (no DOM access).
 *
 * The marker is the two characters backslash and "n", not a line-feed
 * character. Text outside tags is returned unchanged; character entities
 * are not decoded.
 *
 * @param {string} s - HTML markup to process.
 * @returns {string} The input with every run of tags replaced by one marker.
 */
function extractContent(s) {
  // <[^>]+> matches a single tag; the outer (...)+ makes directly adjacent
  // tags such as "</div><div>" collapse into one marker instead of several.
  return s.replace(/(<[^>]+>)+/gim, "\\n");
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.