在不丢失新行的情况下从 html 中提取文本

Question

I use the following code to extract text from HTML. 我使用以下代码从 HTML 中提取文本。

// Sample markup: leading text followed by three <div> blocks, the last holding only a <br>.
var html = "first line.&nbsp;<div>second line.&nbsp;</div><div>third line.</div><div><br></div>";
// Convert the markup to text, then print it behind a "TEXT: " label.
var text = extractContent(html);
console.log(`TEXT: ${text}`);

/**
 * Reads back the text of an HTML fragment by loading it into a <span> element.
 *
 * @param {string} s - HTML markup to load into the element.
 * @returns {string} The element's textContent, or its innerText when
 *    textContent is falsy.
 */
function extractContent(s) {
   const holder = document.createElement('span');
   holder.innerHTML = s;

   const { textContent } = holder;
   if (textContent) {
      return textContent;
   }
   // textContent was falsy (e.g. an empty string), so fall back to innerText.
   return holder.innerText;
}

The result of this code is the text without new lines. 这段代码的结果是没有换行的文本。 But I want each div in the result to be replaced with "\n", like this: 但我希望结果中用 "\n" 替换 div，如下所示：

first line."\n"second line. "\n" third line."\n"

Answer 1

Use s.replaceAll('&nbsp', '\\\\n'); to replace each '&nbsp' with a line-break marker. 使用 s.replaceAll('&nbsp', '\\\\n'); 将每个 '&nbsp' 替换为换行标记。
Note: The backslash \\ needs to be escaped with another \\. 注意：反斜杠 \\ 需要用另一个 \\ 转义。

 // Answer 1 demo: substitute every '&nbsp' in the markup before it is parsed,
 // then read the parsed text back from a <span> element.
 var html = "first line.&nbsp;<div>second line.&nbsp;</div><div>third line.</div><div><br></div>";
 var text = extractContent(html);
 console.log("TEXT: " + text);

 /**
  * Returns the text of an HTML fragment after replacing each '&nbsp' occurrence.
  *
  * @param {string} s - HTML markup to convert.
  * @returns {string} The span's textContent, or its innerText when textContent
  *   is falsy.
  */
 function extractContent(s) {
   var span = document.createElement('span');
   // NOTE(review): the search text omits the entity's trailing ';', so that ';'
   // remains in the output. The replacement is an escaped-backslash sequence,
   // so literal backslash characters followed by 'n' are inserted, not a
   // line-feed character. Confirm both are intended.
   span.innerHTML = s.replaceAll('&nbsp', '\\\\n');
   return span.textContent || span.innerText;
 }

Answer 2

document.createElement is not necessary; you can use a regular expression alone to achieve this: document.createElement 不是必需的，您可以单独使用正则表达式来实现这一点：

 // Answer 2 demo: turn the markup into text with a regular expression only,
 // without creating any DOM element.
 var html = "first line.&nbsp;<div>second line.&nbsp;</div><div>third line.</div><div><br></div>";
 var text = extractContent(html);
 console.log("TEXT: " + text);

 /**
  * Converts HTML markup to plain text by replacing tags with line feeds.
  *
  * Each run of one or more adjacent tags collapses into a single line feed, so
  * wrappers with no text between them (such as a div holding only a br) do not
  * produce extra blank lines. Only tags are matched; character entities such
  * as `&nbsp;` are left in the result unchanged.
  *
  * @param {string} s - HTML markup to convert.
  * @returns {string} The markup with every tag run replaced by "\n".
  */
 function extractContent(s) {
   // "\n" is a real line-feed character; an escaped backslash here would print
   // the two characters backslash + n instead of breaking the line.
   return s.replace(/(<[^>]+>)+/g, "\n");
 }

在不丢失新行的情况下从 html 中提取文本

问题描述

2 个解决方案

解决方案1
0 2020-09-11 09:41:10

解决方案2
0 2020-09-11 10:24:08

在不丢失新行的情况下从 html 中提取文本

问题描述

2 个解决方案

解决方案1 0 2020-09-11 09:41:10

解决方案2 0 2020-09-11 10:24:08

解决方案1
0 2020-09-11 09:41:10

解决方案2
0 2020-09-11 10:24:08