高效解析mmap文件

Question

以下是使用boost創建內存映射文件的代碼。

boost::iostreams::mapped_file_params param;
param.path = "\\..\\points.pts";  //! Location of the points file
boost::iostreams::mapped_file_source file;
file.open(param, fileSize);
if (file.is_open())
{
  //! data() hands back a const view of the mapping; ParseData takes a
  //! mutable char*, hence the const_cast.
  char* mappedBytes = const_cast<char*>(file.data());
  //! Destination for the points parsed out of the mapped text
  std::vector<RenPoint> readPoints;
  ParseData(mappedBytes, readPoints);
}

ParseData的實現如下

void ParseData ( char* pbuffer , std::vector<RenPoint>>& readPoints)    
{
  if(!pbuffer)
throw std::logic_error("no Data in memory mapped file");

stringstream strBuffer;
strBuffer << pbuffer;

//! Get the max number of points in the pts file
std::string strMaxPts;
std::getline(strBuffer,strMaxPts,'\n');
auto nSize = strMaxPts.size();
unsigned nMaxNumPts = GetValue<unsigned>(strMaxPts);
readPoints.clear();

//! Offset buffer 
pbuffer += nSize;
strBuffer << pbuffer;
std::string cur_line;
while(std::getline(strBuffer, cur_line,'\n'))
{
       //! How do I read the data from mmap file directly and populate my renpoint structure    
           int yy = 0;
}

//! Working but very slow
/*while (std::getline(strBuffer,strMaxPts,'\n'))
{
    std::vector<string> fragments;

    istringstream iss(strMaxPts);

    copy(istream_iterator<string>(iss),
        istream_iterator<string>(),
        back_inserter<vector<string>>(fragments));

    //! Logic to populate the structure after getting data back from fragments
    readPoints.push_back(pt);
}*/
}

我已經說過我的數據結構中至少有100萬個點，並且我想優化解析。 有任何想法嗎？

Answer 1

讀入標題信息以獲取點數
在std::vector中為N * num_points個值保留空間（僅有X、Y、Z時N = 3，帶法線時為6，帶法線和RGB時為9）
將文件的其余部分加載到字符串中
使用boost::spirit::qi::phrase_parse將其解析到向量中。

//這里的代碼可以在我2歲的Macbook上用大約14秒的時間解析一個具有40M點（> 1GB）的文件：

#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include <boost/spirit/include/qi.hpp>

template <typename Iter>
bool parse_into_vec(Iter p_it, Iter p_end, std::vector<float>& vf) {
    using boost::spirit::qi::phrase_parse;
    using boost::spirit::qi::float_;
    using boost::spirit::qi::ascii::space;

    bool ret = phrase_parse(p_it, p_end, *float_, space, vf);
    return p_it != p_end ? false : ret;
}

// Entry point: parses the point file named by the first command-line argument.
//
// The file starts with the number of points, followed by the float values.
// Returns 0 on success and -1 when the argument is missing, the file cannot
// be opened, the point count cannot be read, or the float data fails to parse.
int main(int argc, char **args) {
    if(argc < 2) {
        std::cerr << "need a file" << std::endl;
        return -1;
    }
    std::ifstream in(args[1]);
    if(!in) {
        std::cerr << "cannot open file" << std::endl;
        return -1;
    }

    // Without this check a failed read would leave numPoints unset and the
    // reserve() below would be sized from garbage.
    size_t numPoints = 0;
    if(!(in >> numPoints)) {
        std::cerr << "missing point count" << std::endl;
        return -1;
    }

    // Slurp the rest of the file into one string for the parser.
    std::istreambuf_iterator<char> eos;
    std::istreambuf_iterator<char> it(in);
    std::string strver(it, eos);

    // 3 floats per point (X, Y, Z).
    std::vector<float> vf;
    vf.reserve(3 * numPoints);

    if(!parse_into_vec(strver.begin(), strver.end(), vf)) {
        std::cerr << "failed during parsing" << std::endl;
        return -1;
    }

    return 0;
}

Answer 2

AFAICT，您當前正在將文件的全部內容復制到strBuffer 。

我認為您想做的是改為使用boost::iostreams::stream和您的mapped_file_source 。

這是一個未經測試的示例，基於鏈接的文檔：

// Create a stream directly on top of a mapped_file_source for the given path.
boost::iostreams::stream<boost::iostreams::mapped_file_source> str("some/path/file");
// Alternatively, create the mapped_file_source separately (here: `file`) and
// tell a default-constructed stream to open it (using a copy of your mapped_file_source).
boost::iostreams::stream<boost::iostreams::mapped_file_source> str2;
str2.open(file);

// Now you can use std::getline on the stream as you normally would.
std::getline(str, strMaxPts);

說到mapped_file_source，我要指出默認情況下mapped_file_source會映射整個文件，因此無需顯式傳遞大小。

Answer 3

您可以使用類似這樣的東西（只是一個快速的概念，您需要添加一些其他的錯誤檢查等）：

#include "boost/iostreams/stream.hpp"
#include "boost/iostreams/device/mapped_file.hpp"
#include "boost/filesystem.hpp"
#include "boost/lexical_cast.hpp"

// Fast, minimal decimal-to-double conversion.
//
// Accepts an optional leading '+' or '-', followed by digits with '.' or ','
// as the decimal separator (e.g. "12", "-3.5", "0,25").
// Exponents ("1e5"), "inf"/"nan" and embedded whitespace are NOT supported:
// any other character is folded in as if it were a digit, so callers must
// only pass tokens in the format above (or use atof for general input).
//
// Returns the parsed value; an empty string yields 0.
double parse_double(const std::string & str)
{
  std::string::const_iterator it = str.begin();
  const std::string::const_iterator end = str.end();

  // Optional sign. Without this, '-' would be treated as a digit and
  // corrupt the result.
  bool negative = false;
  if (it != end && (*it == '-' || *it == '+'))
  {
    negative = (*it == '-');
    ++it;
  }

  double value = 0;
  double divisor = 1.0;  // 10^(number of digits seen after the separator)
  bool decimal = false;
  for (; it != end; ++it)
  {
    const char c = *it;
    if (c == '.' || c == ',')
    {
      decimal = true;
      continue;
    }
    // Accumulate all digits as one integer; scale once at the end.
    value = value * 10 + (c - '0');
    if (decimal)
      divisor *= 10;
  }

  const double magnitude = value / divisor;
  return negative ? -magnitude : magnitude;
}


// Handles one whitespace-delimited token from the input.
//
// While `initialized` is false the token is the value count: it is converted
// with boost::lexical_cast and used only to reserve capacity in `values`.
// Every later token is converted to double and appended to `values`.
void process_value(const bool initialized, const std::string & str, std::vector< double > & values)
{
  if (initialized)
  {
    // Timings noted for the alternative conversions:
    //   const double value = 0;                                  ~ 0:20 min
    //   const double value = parse_double(str);                  ~ 0:35 min
    //   const double value = atof(str.c_str());                  ~ 1:20 min
    //   const double value = boost::lexical_cast< double >(str); ~ 8:00 min ?!?!?
    values.push_back(parse_double(str));
    return;
  }

  // Header token: convert the value count and prepare the output vector.
  values.reserve(boost::lexical_cast< size_t >(str));
}


// Reads a whitespace-separated text file through a sequence of memory-mapped
// chunks and feeds each token to process_value().
//
// The first token is passed with initialized == false (the value count);
// all following tokens are passed with initialized == true.
//
// name:   path of the file to read.
// values: receives the converted values via process_value().
// Returns false if a chunk could not be mapped, true otherwise.
// NOTE(review): boost::filesystem::file_size presumably throws for a missing
// file rather than leading to the `false` return - confirm.
bool load_file(const std::string & name, std::vector< double > & values)
{
  // Chunk size is 256 MB rounded down to a multiple of the mapping alignment
  // (presumably because mapping offsets must be multiples of it - confirm
  // against the Boost.Iostreams documentation).
  const int granularity = boost::iostreams::mapped_file_source::alignment();
  const boost::uintmax_t chunk_size = ( (256 /* MB */ << 20 ) / granularity ) * granularity;
  boost::iostreams::mapped_file_params in_params(name);
  in_params.offset = 0;
  // Number of bytes of the file not yet processed.
  boost::uintmax_t left = boost::filesystem::file_size(name);
  // Tokenizer state lives outside the chunk loop so that a token that
  // straddles a chunk boundary keeps accumulating in `value`.
  std::string value;
  bool whitespace = true;    // true while the last character seen was whitespace
  bool initialized = false;  // becomes true once the first token has been processed
  while (left > 0)
  {
    // Map the next chunk (or whatever remains of the file).
    in_params.length = static_cast< size_t >(std::min(chunk_size, left));
    boost::iostreams::mapped_file_source in(in_params);
    if (!in.is_open())
      return false;
    const boost::iostreams::mapped_file_source::size_type size = in.size();
    const char * data = in.data();
    for (boost::iostreams::mapped_file_source::size_type i = 0; i < size; ++i, ++data)
    {
      const char c = *data;
      // Note: strchr also matches the terminating NUL of the search string,
      // so a '\0' byte in the file is treated as whitespace too.
      if (strchr(" \t\n\r", c))
      {
        // c is whitespace
        if (!whitespace)
        {
          whitespace = true;
          // finished previous value
          process_value(initialized, value, values);
          initialized = true;
          // start a new value
          value.clear();
        }
      }
      else
      {
        // c is not whitespace
        whitespace = false;
        // append the char to the value
        value += c;
      }
    }
    // A mapping shorter than a full chunk means this was the last chunk.
    if (size < chunk_size)
      break;
    in_params.offset += chunk_size;
    left -= chunk_size;
  }
  if (!whitespace)
  {
    // The file did not end in whitespace: convert the last, still pending value
    process_value(initialized, value, values);
  }
  return true;
}

請注意，您的主要瓶頸將是從字符串到浮點數的轉換，這非常緩慢（使用boost::lexical_cast時更是慢得離譜）。使用我自定義的專用parse_double函數速度更快，但它只支持一種特定格式（例如，如果允許負值，則需要添加符號檢測等；如果需要支持所有可能的格式，則可以直接使用atof）。

如果您想更快地解析文件，則可能需要使用多線程——例如，一個線程只負責切分出字符串值，另一個線程負責將已加載的字符串值轉換為浮點數。在這種情況下，您甚至可能不需要內存映射文件，因為常規的帶緩沖的文件讀取可能就足夠了（反正該文件只會被讀取一次）。

Answer 4

關於代碼的一些快速注釋：1）您沒有為向量保留空間，因此每次添加值時它都會進行擴展。 您已經從文件中讀取了點數，因此在clear（）之后調用reserve（N）。

2）您一次性請求映射整個文件，這在64位系統上可行，但可能很慢，而且strBuffer << pbuffer還會強制再分配一塊同樣大小的內存；

http://www.boost.org/doc/libs/1_53_0/doc/html/interprocess/sharedmemorybetweenprocesses.html#interprocess.sharedmemorybetweenprocesses.mapped_file.mapped_file_mapping_regions顯示了如何獲取區域

通過getRegion使用循環來加載包含許多行的估計數據塊。 您將必須處理部分緩沖區-每個getRegion都可能以您需要保留的行的一部分結尾，並連接到從下一個區域開始的下一個部分緩沖區。

高效解析mmap文件

問題描述

4 個解決方案

解決方案1
2 已采納 2013-06-29 22:26:51

解決方案2
1 2013-06-24 10:54:27

解決方案3
1 2013-06-25 23:07:46

解決方案4
0 2013-06-25 17:49:05

高效解析mmap文件

問題描述

4 個解決方案

解決方案1 2 已采納 2013-06-29 22:26:51

解決方案2 1 2013-06-24 10:54:27

解決方案3 1 2013-06-25 23:07:46

解決方案4 0 2013-06-25 17:49:05

解決方案1
2 已采納 2013-06-29 22:26:51

解決方案2
1 2013-06-24 10:54:27

解決方案3
1 2013-06-25 23:07:46

解決方案4
0 2013-06-25 17:49:05