![](/img/trans.png)
[英]How to parse string into words and punctuation marks using javascript
[英]Extract Clickable words from String and Include Punctuation Marks
我有一個句子和句子中的一系列可點擊單詞。 數組不包括標點符號。
這里有一句話
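Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We're going on a journey,” my father said. “To a secret place. We'll catch the air! We'll catch the breeze!”
(譯文:我們在后備箱里放了兩根桿子、一罐蟲子、一袋三明治和一熱水瓶。 “我們要去旅行了,”我父親說。 “去一個秘密的地方。 我們會趕上空氣! 我們會乘風破浪!” 注意:下面 tokens 中的位置索引對應的是英文原句。)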
這是可點擊單詞的結構。 它是一個數組,包含每個單詞在句子中開始和結束位置的索引。 這個數組不包含句子中的標點符號。
標點符號不可點擊。
"tokens": [
{
"position": [
0,
4
],
"value": "into"
},
{
"position": [
5,
8
],
"value": "the"
},
{
"position": [
9,
14
],
"value": "trunk"
},
{
"position": [
15,
17
],
"value": "we"
},
{
"position": [
18,
21
],
"value": "put"
},
{
"position": [
22,
25
],
"value": "two"
},
{
"position": [
26,
31
],
"value": "poles"
},
{
"position": [
32,
35
],
"value": "and"
},
{
"position": [
36,
39
],
"value": "the"
},
{
"position": [
40,
43
],
"value": "can"
},
{
"position": [
44,
46
],
"value": "of"
},
{
"position": [
47,
52
],
"value": "worms"
},
{
"position": [
53,
56
],
"value": "and"
},
{
"position": [
57,
58
],
"value": "a"
},
{
"position": [
59,
63
],
"value": "sack"
},
{
"position": [
64,
66
],
"value": "of"
},
{
"position": [
67,
77
],
"value": "sandwiches"
},
{
"position": [
78,
81
],
"value": "and"
},
{
"position": [
82,
83
],
"value": "a"
},
{
"position": [
84,
91
],
"value": "thermos"
},
{
"position": [
92,
94
],
"value": "of"
},
{
"position": [
95,
100
],
"value": "water"
},
{
"position": [
103,
108
],
"value": "we're"
},
{
"position": [
109,
114
],
"value": "going"
},
{
"position": [
115,
117
],
"value": "on"
},
{
"position": [
118,
119
],
"value": "a"
},
{
"position": [
120,
127
],
"value": "journey"
},
{
"position": [
130,
132
],
"value": "my"
},
{
"position": [
133,
139
],
"value": "father"
},
{
"position": [
140,
144
],
"value": "said"
},
{
"position": [
147,
149
],
"value": "to"
},
{
"position": [
150,
151
],
"value": "a"
},
{
"position": [
152,
158
],
"value": "secret"
},
{
"position": [
159,
164
],
"value": "place"
},
{
"position": [
166,
171
],
"value": "we'll"
},
{
"position": [
172,
177
],
"value": "catch"
},
{
"position": [
178,
181
],
"value": "the"
},
{
"position": [
182,
185
],
"value": "air"
},
{
"position": [
187,
192
],
"value": "we'll"
},
{
"position": [
193,
198
],
"value": "catch"
},
{
"position": [
199,
202
],
"value": "the"
},
{
"position": [
203,
209
],
"value": "breeze"
}
]
},
這是我獲取可點擊單詞的代碼
// Builds one display string per token: the word itself followed by a slice of
// the text that comes right after it in `content`.
const getWordsFromTokens = tokens.map(({ position }) => {
  // `position` holds the [start, end] offsets of the word inside `content`.
  const [wordStart, wordEnd] = position;
  const wordLength = wordEnd - wordStart;

  // Text taken after the word: it starts at the word's end offset and is as
  // long as the word itself.
  const trailingText = content.substring(wordEnd, wordEnd + wordLength);
  console.log(trailingText);

  // The word concatenated with the trailing text read above.
  return content.substring(wordStart, wordEnd) + trailingText;
});
這是我如何渲染文本
return (
<div>
<p> {
getWordsFromTokens.map((word, index)=>{
return <a href={'/word/' + word} > {word}</a>
})
}
</p>
</div>
)
這是我的問題,當我渲染文本時,它看起來並不完全像原始文本。 我可能做錯了什么?
這是最終結果的樣子
我們在后備箱里放了兩根電線桿和一罐蠕蟲,一袋三明治和一瓶熱水。 “我們要踏上旅途,”我父親說。 說。 “去一個秘密的地方。 我們' 我們會趕上 ai 空氣! W 我們會趕上微風!
這樣的解決方案怎么樣? 我使用cursor來跟蹤句子中的 position。
// Clickable words. `position` is the [start, end) character range of the word
// inside `content`; punctuation and spaces are not listed.
const tokens = [
  { position: [0, 4], value: "into" },
  { position: [5, 8], value: "the" },
  { position: [9, 14], value: "trunk" },
  { position: [15, 17], value: "we" },
  { position: [18, 21], value: "put" },
  { position: [22, 25], value: "two" },
  { position: [26, 31], value: "poles" },
  { position: [32, 35], value: "and" },
  { position: [36, 39], value: "the" },
  { position: [40, 43], value: "can" },
  { position: [44, 46], value: "of" },
  { position: [47, 52], value: "worms" },
  { position: [53, 56], value: "and" },
  { position: [57, 58], value: "a" },
  { position: [59, 63], value: "sack" },
  { position: [64, 66], value: "of" },
  { position: [67, 77], value: "sandwiches" },
  { position: [78, 81], value: "and" },
  { position: [82, 83], value: "a" },
  { position: [84, 91], value: "thermos" },
  { position: [92, 94], value: "of" },
  { position: [95, 100], value: "water" },
  { position: [103, 108], value: "we're" },
  { position: [109, 114], value: "going" },
  { position: [115, 117], value: "on" },
  { position: [118, 119], value: "a" },
  { position: [120, 127], value: "journey" },
  { position: [130, 132], value: "my" },
  { position: [133, 139], value: "father" },
  { position: [140, 144], value: "said" },
  { position: [147, 149], value: "to" },
  { position: [150, 151], value: "a" },
  { position: [152, 158], value: "secret" },
  { position: [159, 164], value: "place" },
  { position: [166, 171], value: "we'll" },
  { position: [172, 177], value: "catch" },
  { position: [178, 181], value: "the" },
  { position: [182, 185], value: "air" },
  { position: [187, 192], value: "we'll" },
  { position: [193, 198], value: "catch" },
  { position: [199, 202], value: "the" },
  { position: [203, 209], value: "breeze" },
];

// Template literal: the sentence contains both apostrophes and a double quote,
// so neither '...' nor "..." could hold it without escaping.
const content = `Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We're going on a journey,” my father said. “To a secret place. We'll catch the air! We'll catch the breeze!"`;

let cursorPosition = 0; // offset in `content` just past the last word consumed so far

// Walk the tokens in order and split `content` into alternating
// { type: 'non-word' } and { type: 'word' } segments.
const getWordsFromTokens = tokens.reduce((words, token) => {
  const [tokenStart, tokenEnd] = token.position;

  // Spaces/punctuation between the previous word and this one.
  const notWordBeforeThisWord = content.substring(cursorPosition, tokenStart);
  // The word exactly as written in the sentence (keeps the original casing).
  const tokenValue = content.substring(tokenStart, tokenEnd);

  words.push(
    { type: 'non-word', value: notWordBeforeThisWord },
    { type: 'word', value: tokenValue },
  );

  cursorPosition = tokenEnd; // the next gap starts where this word ends
  return words;
}, []);

// Any characters left after the last token (closing punctuation, quotes, ...).
const endOfSentence = content.substring(cursorPosition);

// Escapes the characters that are significant in HTML text and attribute values.
const escapeHtml = (text) =>
  text.replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

// Build the markup once instead of re-parsing innerHTML on every append.
// Words are escaped so that an apostrophe (e.g. "We're") cannot terminate the
// href attribute or otherwise be interpreted as markup.
const sentenceHtml = [
  ...getWordsFromTokens.map((item) =>
    item.type === 'word'
      ? `<a href="/word/${escapeHtml(encodeURIComponent(item.value))}">${escapeHtml(item.value)}</a>`
      : escapeHtml(item.value)
  ),
  escapeHtml(endOfSentence),
].join('');

// Only touch the DOM when one exists, so the segmentation above stays usable
// outside the browser.
if (typeof document !== 'undefined') {
  document.getElementById('new-sentence').innerHTML = sentenceHtml;
}
<p id='new-sentence'></p>
我認為使用 RegExp 會讓您的生活更輕松:
const content = `Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. "We're going on a journey," my father said. "To a secret place. We'll catch the air; We'll catch the breeze.`. const result = content;match(/([\w'])+|([\,:?;-_.;"]+[\s"]*["]*)/gim). console;log(result), const punctuation = /[\:?;.\-_.?"]+/: function App() { return ( <div> {result;map((w) => punctuation.test(w), w. <a href={`/word/${w}`}>{w + '\n'}</a> )} </div> ); } ReactDOM.render(<App/>, document.getElementById("root"))
<div id="root"></div> <script src="https://cdnjs.cloudflare.com/ajax/libs/react/16.6.3/umd/react.production.min.js"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/react-dom/16.6.3/umd/react-dom.production.min.js"></script>
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.