简体   繁体   中英

How can I split a long continuous string into an array of the words it contains?

I have a long continuous string that looks something like this:

let myString = "onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteen";

It does not have any separators to easily target.
So how can I iterate over it and split the words so it ends up like:

splitString = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen"];

Preferably with JavaScript.

The problem here is the lack of separators as you have mentioned - this makes it impossible for the software to know where the words begin and end.

Given that you know the words that will show up, my technique would be as follows:

NOTE: This does not take into account the possibility of overlapping words and assumes none of the words are possible subsets of other words...

  1. Iterate the known words
  2. Search (indexOf) the string for each known word and note down its positions in the string
  3. Sort the values by the index values
  4. Generate an array with the values contained in the order found

/**
 * This assumes that:
 *  - Input words are not subsets of other input words
 */

/**
 * Find every non-overlapping occurrence of a word in a string.
 *
 * The search runs left to right and resumes just past the end of each hit,
 * so overlapping occurrences are not reported.
 *
 * @param {string} inputString - The string to search in.
 * @param {string} inputWord - The word to look for.
 * @returns {{index: number, word: string}[]} One entry per occurrence, in
 *   ascending index order; empty if the word is empty or never occurs.
 */
function findAll(inputString, inputWord) {
    const indices = [];
    // An empty word "matches" at every position without moving the cursor
    // forward, which would loop forever, so it is reported as having no hits.
    if (inputWord === '') return indices;
    let index = 0;
    while (index < inputString.length) {
        index = inputString.indexOf(inputWord, index);
        if (index === -1) break; // -1 means not found, so stop searching
        indices.push({ index, word: inputWord });
        // Resume after this hit so the same occurrence is not found again
        index += inputWord.length;
    }
    return indices;
}

/**
 * Locate every occurrence of every input word inside the input string.
 *
 * The hits are grouped by word, in the order the words were supplied; they
 * are NOT sorted by their position in the string.
 *
 * @param {string} inputString - The string to search in.
 * @param {Iterable<string>} inputWords - The words to look for.
 * @returns {Array} The concatenated results of findAll() for each word.
 */
function splitWords(inputString, inputWords) {
    // One list of hits per word ...
    const hitsPerWord = Array.from(inputWords, (word) => findAll(inputString, word));
    // ... merged into a single flat list
    return hitsPerWord.flat();
}

/**
 * Order word hits by their position and return just the words.
 *
 * @param {{index: number, word: string}[]} inputArr - Hits to order; each
 *   entry's `index` is used as the sort key. The array is left untouched.
 * @returns {string[]} The `word` of each hit, in ascending `index` order.
 */
const orderWords = (inputArr) =>
    inputArr
        .slice() // sort() works in place; copy first so the caller's array keeps its order
        .sort((a, b) => a.index - b.index)
        .map((input) => input.word);

/**
 * Usage like so:
 */
// Demo input: the words run together with no separators
const myString = 'onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteen';

// The vocabulary to look for inside the demo string
const inputWords = [
    "one", "two", "three", "four", "five", "six", "seven",
    "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
];

// Locate every word, then arrange the hits in the order they appear
const ordered = orderWords(splitWords(myString, inputWords));

console.dir(ordered);

/**
 * Result:
    [
    'one',      'two',
    'three',    'four',
    'five',     'six',
    'seven',    'eight',
    'nine',     'ten',
    'eleven',   'twelve',
    'thirteen', 'four',
    'fourteen'
    ]
 */

If, as you said in the comments, you know the expected words, then create an array of these words and loop through your string to find them.

Note that the code below takes the length of the matched words into account, so that it can find longer words such as "one hundred eighty five"; otherwise the loop would stop as soon as it finds "one".

You can read the comments in the code to better understand it.

 // The merged input string
const myString =
    "onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteentwentyfiveonehundredeightyfiveeightyfive";

// The list of expected words; entries may contain spaces ("twenty five"),
// which are ignored when matching against the merged string
const possibleWords = [
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "twenty five",
    "one hundred eighty five",
    "eighty five",
];

/**
 * Split a separator-less string into the known words it contains.
 *
 * Scans left to right. At each position the longest expected word that
 * starts there wins (so "one hundred eighty five" is preferred over "one");
 * if several expected words are equally long, the one listed first wins.
 * Characters that do not start any expected word are skipped.
 *
 * @param {string} mergedString - The string with the words run together.
 * @param {string[]} possibleWords - Expected words; spaces inside a word
 *   are ignored for matching but kept in the returned value.
 * @returns {string[]} The matched words, as spelled in `possibleWords`, in
 *   the order they occur in `mergedString`.
 */
function separateString(mergedString, possibleWords) {
    // Strip the spaces once up front instead of on every comparison
    const candidates = possibleWords.map((word) => ({
        word,
        compact: word.replace(/ /g, ""),
    }));

    const result = [];
    let position = 0;
    while (position < mergedString.length) {
        let matchedWord = "";
        // Length of the best match WITHOUT spaces. Comparing against the
        // spaced length would let "a b" (3 chars) block the longer "abc".
        let matchedLength = 0;
        for (const { word, compact } of candidates) {
            if (
                compact.length > matchedLength &&
                mergedString.startsWith(compact, position)
            ) {
                matchedWord = word;
                matchedLength = compact.length;
            }
        }

        if (matchedLength > 0) {
            result.push(matchedWord);
            // Continue right after the word that was just consumed
            position += matchedLength;
        } else {
            // Nothing starts here; move on by one character
            position += 1;
        }
    }
    return result;
}

console.log(separateString(myString, possibleWords));

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM