[英]C++ read file word by word without any symbol
I want to read a text file word by word. 我想从文本文件中逐词读取。 Here's my code in C++:
这是我用 C++ 编写的代码:
int main(int argc, const char * argv[]) {
    // Echo every whitespace-delimited token of the input file on its own line.
    ifstream file("./wordCount.txt");
    for (string word; file >> word; ) {
        cout << word << endl;
    }
    return 0;
}
The text file contains the sentence: 文本文件包含以下句子:
I don't have power, but he has power.
Here's the result I get: 这是我得到的结果:
I
don\241\257t
have
power,
but
he
has
power.
Can you tell me how to get the result like the below format: 您能告诉我如何获得如下格式的结果:
I
don't
have
power
but
he
has
power
Thanks. 谢谢。
I understand that you want to get rid of the punctuation. 我了解您想要去掉标点符号。
Unfortunately, extracting strings from a stream treats only whitespace as a separator. 不幸的是,从流中提取字符串时只把空白字符当作分隔符。 So "don't" or "Hello,world" would be read as one word, and "don' t" or "Hello, world" as two words.
因此,"don't" 或 "Hello,world" 会被读作一个单词,而 "don' t" 或 "Hello, world" 会被读作两个单词。
The alternative is to read the text line by line and use string::find_first_of() to jump from separator to separator:
另一种方法是逐行读取文本,然后使用 string::find_first_of() 从一个分隔符跳到下一个分隔符:
// Characters that terminate a word: whitespace plus common punctuation.
// The apostrophe is not listed, so contractions such as "don't" stay whole.
string separator{" \t\r\n,.!?;:"};
string line;
while (getline(cin, line)) {                                  // read line by line
    size_t s = 0;                                             // s = offset of next word
    // Skip leading separators; npos means no word is left on this line.
    while ((s = line.find_first_not_of(separator, s)) != string::npos) {
        const size_t e = line.find_first_of(separator, s);    // e = end of the word (npos at end of line)
        cout << line.substr(s, e - s) << endl;                // substr clamps the count at end of line
        if (e == string::npos)                                // the word ran to the end of the line
            break;
        s = e + 1;                                            // position after the separator
    }
}
The code below gets rid of punctuation, except for the apostrophe: 下面的代码会去掉除撇号以外的标点符号:
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
using namespace std;
// Reads wordCount.txt token by token and prints each token with its
// punctuation removed, one token per line.
int main(int argc, const char * argv[]) {
    ifstream file("wordCount.txt");
    string word;
    while (file >> word) {
        // Remove punctuation with the erase/remove_if idiom instead of erasing
        // while iterating over the same string (which invalidates the loop's
        // iterators). The apostrophe and the back-tick are kept so that
        // contractions such as "don't" / "don`t" survive. The lambda takes
        // unsigned char because ispunct is undefined for negative values.
        word.erase(remove_if(word.begin(), word.end(),
                             [](unsigned char c) {
                                 return ispunct(c) && c != '\'' && c != '`';
                             }),
                   word.end());
        if (!word.empty())            // a token made only of punctuation prints nothing
            cout << word << endl;
    }
    return 0;
}
should produce the desired output: 应该产生期望的输出:
Georgioss-MacBook-Pro:~ gsamaras$ g++ -Wall -std=c++0x main.cpp
Georgioss-MacBook-Pro:~ gsamaras$ ./a.out
I
don`t
have
power
but
he
has
power
For the problem with some characters, I encourage you to check the encoding of the file, so try doing (as explained here ): 对于一些字符的问题,我建议你检查文件的编码,所以尝试做(如解释在这里 ):
file -I wordCount.txt
wordCount.txt: text/plain; charset=us-ascii
which is what worked for me. 这对我有用。 Or simply open a text editor and make sure the characters are valid.
或者直接打开文本编辑器并确保字符有效。
To ease debugging, I replaced the file with a std::istringstream. 为了简化调试,我将文件替换为 std::istringstream。
I also added a bool (class data attribute, m_dbg) to simplify enabling/disabling additional diagnostic information. 我还添加了一个布尔值(类数据成员 m_dbg),以便启用/禁用额外的诊断信息。
#include <algorithm>
#include <chrono>
// Short aliases for the <chrono> types used by the timing code.
using HRClk_t = std::chrono::high_resolution_clock; // high-resolution clock
using Time_t  = HRClk_t::time_point;                // time point of that clock
using MS_t    = std::chrono::milliseconds;          // milliseconds
using US_t    = std::chrono::microseconds;          // microseconds
using NS_t    = std::chrono::nanoseconds;           // nanoseconds
using namespace std::chrono_literals; // support suffixes like 100ms, 2s, 30us
#include <array>
#include <cctype>
#include <clocale>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
// Splits text into words. Commas and full stops are removed from each line,
// the remainder is split on whitespace, and every word is printed to
// std::cout (one per line, prefixed by a blank) and stored in order.
class T496_t
{
   std::array<char, 256>    m_keep;     // non-zero entry => filter() copies that byte value through
   std::vector<std::string> m_wordVec;  // every word extracted so far, in input order
   bool                     m_dbg;      // true => also print per-line / per-word diagnostics

public:
   // dbg: enable the additional diagnostic output (off by default).
   explicit T496_t(bool dbg = false) : m_dbg(dbg)
   {
      // Identity table: each byte value maps to itself, so every entry except
      // index 0 starts out non-zero (kept); then the unwanted characters are
      // cleared.
      for (std::size_t i = 0; i < m_keep.size(); ++i)
         m_keep[i] = static_cast<char>(i);
      m_keep[static_cast<unsigned char>(',')] = 0;
      m_keep[static_cast<unsigned char>('.')] = 0;
   }

   // Runs the extraction on the built-in sample text. Always returns 0.
   int exec()
   {
      std::istringstream file(
         "Hello\n"
         "I don't have power, but he has power.\n"
         "I don't have power , but he has power.\n"
         ); //ifstream file("./wordCount.txt");
      return exec(file);
   }

   // Runs the extraction on every line of `file`. Always returns 0.
   // A final line without a trailing newline is processed like any other.
   int exec(std::istream& file)
   {
      unsigned lineCount = 1;
      std::string line;
      while (std::getline(file, line))
      {
         if (m_dbg) std::cout << "\n line " << lineCount++ << " : '"
                              << line << "'\n " << std::setfill('-')
                              << std::setw(static_cast<int>(line.size()) + 12)
                              << "-" << std::setfill(' ');
         std::cout << '\n';          // one separator line per input line, even with diagnostics off

         if (line.empty())
            continue;                // ignore empty lines

         extractWordsFrom(line);     // extract words
      }
      return 0;
   }

   // Words collected by all exec() calls so far, in input order.
   const std::vector<std::string>& words() const { return m_wordVec; }

private: // methods

   // isspace() is undefined for negative values, so go through unsigned char.
   static bool isSpace(char ch)
   {
      return std::isspace(static_cast<unsigned char>(ch)) != 0;
   }

   // Filters `unfiltered` (which is left-trimmed in place), then stores and
   // prints each whitespace-delimited word of the filtered text.
   void extractWordsFrom(std::string& unfiltered)
   {
      std::string line; // filtered
      filter(unfiltered, line);

      if (line.empty()) {
         if (m_dbg) std::cout << " empty line" << std::endl;
         return;
      }

      std::size_t begin = 0;
      for (;;)
      {
         // Skip separators; stopping at the end of the text means trailing
         // blanks never produce an empty word.
         while (begin < line.size() && isSpace(line[begin])) ++begin;
         if (begin >= line.size())
            break;

         // The word runs up to the next whitespace character of any kind.
         std::size_t end = begin;
         while (end < line.size() && !isSpace(line[end])) ++end;

         m_wordVec.push_back(line.substr(begin, end - begin));

         if (m_dbg) {
            std::cout << " word(" << std::setw(3) << begin;
            if (end == line.size()) std::cout << ", eoln): ";
            else                    std::cout << "," << std::setw(3) << end << "): ";
         }
         std::cout << " " << m_wordVec.back() << '\n';

         begin = end;
      }
   }

   // Left-trims `unfiltered` in place and appends to `line` every character
   // whose m_keep entry is non-zero.
   void filter(std::string& unfiltered, std::string& line)
   {
      ltrim(unfiltered); // remove leading blanks

      line.reserve(line.size() + unfiltered.size());
      for (const char ch : unfiltered)
         if (m_keep[static_cast<unsigned char>(ch)]) // unsigned index: bytes >= 0x80 stay in range
            line.push_back(ch);
   }

   // trim from start
   void ltrim(std::string& s) {
      s.erase(s.begin(),
              std::find_if(s.begin(), s.end(),
                           [](char ch) { return !isSpace(ch); }));
   }

   // trim from end
   void rtrim(std::string& s) {
      s.erase(std::find_if(s.rbegin(), s.rend(),
                           [](char ch) { return !isSpace(ch); }).base(),
              s.end());
   }

   // trim from both ends
   void lrtrim(std::string& s) { rtrim(s); ltrim(s); }

}; // class T496_t
int main(int /*argc*/, char** /*argv[]*/)
{
setlocale(LC_ALL, "");
std::ios::sync_with_stdio(false);
Time_t start_us = HRClk_t::now();
int retVal = -1;
{
T496_t t496;
retVal = t496.exec();
}
auto duration_us = std::chrono::duration_cast<US_t>(HRClk_t::now() - start_us);
std::cout << "\n\n FINI " << duration_us.count() << " us" << std::endl;
return(retVal);
} }
// desired output:
// I
// don't
// have
// power
// but
// he
// has
// power
Output from this code: 此代码的输出:
Hello
I
don't
have
power
but
he
has
power
I
don't
have
power
but
he
has
power
Output with m_dbg=true 输出m_dbg = true
line 1 : 'Hello'
-----------------
word( 0, eoln): Hello
line 2 : 'I don't have power, but he has power.'
-------------------------------------------------
word( 0, 1): I
word( 2, 7): don't
word( 8, 12): have
word( 13, 18): power
word( 19, 22): but
word( 23, 25): he
word( 26, 29): has
word( 30, eoln): power
line 3 : 'I don't have power , but he has power.'
---------------------------------------------------
word( 0, 1): I
word( 2, 7): don't
word( 9, 13): have
word( 14, 19): power
word( 21, 24): but
word( 25, 27): he
word( 28, 31): has
word( 32, eoln): power
FINI 215 us
A simple approach is to filter the string first. 一种简单的方法是首先过滤字符串。 Replace every punctuation character except the apostrophe (i.e. ') with white-space, so that built-in facilities can do the rest of the splitting.
将除撇号(即 ')以外的所有标点符号替换为空格,以便利用内置功能完成后续的拆分。
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <sstream>
#include <iterator>
using namespace std;
// Predicate for replace_if: returns true when c is a punctuation character
// that should be replaced, i.e. any punctuation except the apostrophe.
bool isOk(char c)
{
    if (c == '\'')
        return false;                 // keep apostrophes so "don't" stays one word
    // ispunct() is undefined for negative values, which a plain char holding a
    // non-ASCII byte can be; convert through unsigned char first.
    return ispunct(static_cast<unsigned char>(c)) != 0;
}
// Reads data.txt token by token, blanks out unwanted punctuation and prints
// every resulting piece on its own line.
int main()
{
    ifstream file("data.txt");
    for (string word; file >> word; ) {
        // Blanking punctuation may split one token into several pieces.
        std::replace_if(word.begin(), word.end(), isOk, ' ');
        istringstream pieces(word);
        for (string piece; pieces >> piece; )
            cout << piece << "\n";
    }
    return 0;
}
The output is 输出是
I
don't
have
power
but
he
has
power
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.