ragel解析器不貪心？

Question

我正在嘗試使用ragel在 C++ 中編寫 HTTP 解析器，但發現生成的解析器並不貪婪。

第一步是解析一個 URL，這里是從RFC-3986翻譯的 ragel 語法：

#include <string>
#include <iostream>
#include <string_view>

%%{

    machine http_uri_parser;

    action mark_scheme {
        std::printf("mark scheme, p:%s\n", p);
        this->scheme.data = p;
    }
    action store_scheme {
        std::printf("store scheme, p:%s\n", p);
        this->scheme.len = p - this->scheme.data;
    }

    action mark_authority {
        std::printf("mark authority, p:%s\n", p);
        this->authority.data = p;
    }
    action store_authority {
        std::printf("store authority, p:%s\n", p);
        this->authority.len = p - this->authority.data;
    }

    action mark_userinfo {
        std::printf("mark userinfo, p:%s\n", p);
        this->userinfo.data = p;
    }
    action store_userinfo {
        std::printf("store userinfo, p:%s\n", p);
        this->userinfo.len = p - this->userinfo.data;
    }

    action mark_host {
        std::printf("mark host, p:%s\n", p);
        this->host.data = p;
    }
    action store_host {
        std::printf("store host, p:%s\n", p);
        this->host.len = p - this->host.data;
    }

    action mark_port {
        std::printf("mark port, p:%s\n", p);
        this->port.data = p;
    }
    action store_port {
        std::printf("store port, p:%s\n", p);
        this->port.len = p - this->port.data;
    }

    action mark_path {
        std::printf("mark path, p:%s\n", p);
        this->path.data = p;
    }

    action store_path {
        std::printf("store path, p:%s\n", p);
        this->path.len = p - this->path.data;
    }

    action mark_query {
        std::printf("mark query, p:%s\n", p);
        this->query.data = p;
    }
    action store_query {
        std::printf("store query, p:%s\n", p);
        this->query.len = p - this->query.data;
    }

    action mark_fragment {
        std::printf("mark fragment, p:%s\n", p);
        this->fragment.data = p;
    }
    action store_fragment {
        std::printf("store fragment, p:%s\n", p);
        this->fragment.len = p - this->fragment.data;
    }

    action done {
        std::printf("parser done, p:%s\n", p);
        this->_done = 1;
        fbreak;
    }


###############################################################################
# Characters
###############################################################################

    crlf          = '\r\n';
    gen_delims    = ( ':' | '/' | '?' | '#' | '[' | ']' | '@' );
    sub_delims    = ( '!' | '$' | '&' | "'" | '(' | ')'
                    | '*' | '+' | ',' | ';' | '=' );

    reserved      = ( gen_delims | sub_delims );
    unreserved    = ( alpha | digit | '-' | '.' | '_' | '~' );
    pct_encoded   = ( '%' xdigit xdigit );

###############################################################################
# Scheme
###############################################################################

    scheme        = ( alpha ( alpha | digit | '+' | '-' | '.' )* )
                    >mark_scheme %store_scheme;

###############################################################################
# Authority
###############################################################################

    dec_octet     = ( ( digit                  ) # 0-9
                    | ( ( '1'..'9' ) digit     ) # 10-99
                    | ( '1' digit{2}           ) # 100-199
                    | ( '2' ( '0'..'4' ) digit ) # 200-249
                    | ( '25' ( '0'..'5' )      ) # 250-255
                    );

    IPv4_address  = ( dec_octet '.' dec_octet '.' dec_octet '.' dec_octet );

    h16           = ( xdigit{1,4} );
                  # 16 bits of address represented in hexadecimal
    ls32          = ( ( h16 ':' h16 ) | IPv4_address );
                  # least-significant 32 bits of address

    IPv6_address  = ( (                               ( h16 ':' ){6} ls32 )
                    | (                          '::' ( h16 ':' ){5} ls32 )
                    | ( (                 h16 )? '::' ( h16 ':' ){4} ls32 )
                    | ( ( ( h16 ':' ){,1} h16 )? '::' ( h16 ':' ){3} ls32 )
                    | ( ( ( h16 ':' ){,2} h16 )? '::' ( h16 ':' ){2} ls32 )
                    | ( ( ( h16 ':' ){,3} h16 )? '::' ( h16 ':' ){1} ls32 )
                    | ( ( ( h16 ':' ){,4} h16 )? '::'                ls32 )
                    | ( ( ( h16 ':' ){,5} h16 )? '::'                h16  )
                    | ( ( ( h16 ':' ){,6} h16 )? '::'                     )
                    );
    IPv_future    = ( 'v' ( ( xdigit+ ) '.' ) ( unreserved | sub_delims | ':' )+ );
    IP_literal    = ( '[' ( IPv6_address | IPv_future  ) ']' );

    reg_name      = ( ( unreserved | pct_encoded | sub_delims )* )
                  > { std::printf("mark reg_name, p:%s\n", p); }
                  % { std::printf("store reg_name, p:%s\n", p); };

    port          = ( digit* )
                    >mark_port %store_port;
    host          = ( IP_literal | IPv4_address | reg_name )
                    >mark_host %store_host;
    userinfo      = ( ( unreserved | pct_encoded | sub_delims | ':' )* )
                    >mark_userinfo %store_userinfo;
    authority     = ( ( userinfo '@' )? host ( ':' port )? )
                    >mark_authority %store_authority;

###############################################################################
# Path
###############################################################################

    pchar         = ( unreserved | pct_encoded | sub_delims | ':' | '@' );

    segment       = ( pchar* );
    segment_nz    = ( pchar+ );
                  # non-zero-length
    segment_nz_nc = ( ( unreserved | pct_encoded | sub_delims | '@' )+ );
                  # non-zero-length segment without any colon ':'

    path_abempty  = ( ( '/' segment )* )
                  >mark_path %store_path;
    path_absolute = ( '/' ( segment_nz ( '/' segment )* )? )
                  >mark_path %store_path;
    path_noscheme = ( segment_nz_nc ( '/' segment )* )
                  >mark_path %store_path;
    path_rootless = ( segment_nz ( '/' segment )* )
                  >mark_path %store_path;
    path_empty    = ( zlen )
                  >mark_path %store_path;

    path          = ( path_abempty    # begins with '/' or is empty
                    | path_absolute   # begins with '/' but not '//'
                    | path_noscheme   # begins with a non-colon segment
                    | path_rootless   # begins with a segment
                    | path_empty      # zero characters
                    );

###############################################################################
# Query
###############################################################################

    query         = ( ( pchar | '/' | '?' )* )
                    >mark_query %store_query;

###############################################################################
# Fragment
###############################################################################

    fragment      = ( ( pchar | '/' | '?' )* )
                    >mark_fragment %store_fragment;

###############################################################################
# URI
###############################################################################

    hier_part     = ( ( '//' authority path_abempty )
                    | ( path_absolute               )
                    | ( path_rootless               )
                    | ( path_empty                  )
                    );

    relative_part = ( ( '//' authority path_abempty )
                    | ( path_absolute               )
                    | ( path_noscheme               )
                    | ( path_empty                  )
                    );

    absolute_URI  = ( scheme ':' hier_part ( '?' query )? );

    relative_ref  = ( relative_part ( '?' query )? ( '#' fragment )? );
    URI           = ( scheme ':' hier_part ( '?' query )? ( '#' fragment )? );

    URI_reference = ( URI | relative_ref );

###############################################################################
# main rule
###############################################################################

    main         := URI @done;
}%%

%% write data;

struct slice {
  size_t      len{};
  const char* data{};
};

struct http_parser {
  http_parser()  = default;
  ~http_parser() = default;

  void reset();
  void execute();

  int state = 0;

  std::string uri;

  /* parsed result */

  slice scheme{};
  slice authority{};
  slice userinfo{};
  slice host{};
  slice port{};
  slice path{};
  slice query{};
  slice fragment{};

  /* parse status */

  bool _eof{};
  bool _done{};
  bool _failed{};
};

void http_parser::reset() {
  int cs = 0;

  %% write init;

  this->state   = cs;
  this->_eof    = false;
  this->_done   = false;
  this->_failed = false;

  this->scheme    = slice{};
  this->authority = slice{};
  this->userinfo  = slice{};
  this->host      = slice{};
  this->port      = slice{};
  this->path      = slice{};
  this->query     = slice{};
  this->fragment  = slice{};
}

void http_parser::execute() {
  const char* p   = &this->uri.front();
  const char* pe  = &this->uri.back() + 1;
  const char* eof = pe;
  int         cs  = this->state;

  %% write exec;

  if (!this->_failed) {
    this->state = cs;
  }

  std::printf(
      "eof:%d, done:%d, failed:%d, state:%d, p:%p, pe:%p, diff:%ld, rest:%s\n",
      this->_eof, this->_done, this->_failed, this->state, p, pe, pe - p, p);

#define print_parser_component(fld)                                            \
  if (this->fld.len) {                                                         \
    std::printf(#fld ": %.*s\n", (int)this->fld.len, this->fld.data);          \
  }

  print_parser_component(scheme);
  print_parser_component(authority);
  print_parser_component(userinfo);
  print_parser_component(host);
  print_parser_component(port);
  print_parser_component(path);
  print_parser_component(query);
  print_parser_component(fragment);

#undef print_parser_component
}

在這里，我將main規則設置為URL而不是URI_reference ，以便首先測試絕對 URL。

這是測試代碼：

int main(int argc, char** argv) {
  auto parser = std::make_unique<http_parser>();
  parser->uri =
      "https://chenjianyong.com/blog/2022/01/"
      "seastar_fpc_1.html?hello=world#preface";
  parser->reset();
  parser->execute();
  return 0;
}

運行程序並打印：

mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
parser done, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
eof:0, done:1, failed:0, state:171, p:0x6000016f4006, pe:0x6000016f404c, diff:70, rest://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
scheme: https

似乎解析器在解析完https://方案后停止了，這很奇怪嗎？ 為什么它不貪婪地消耗到最后一個字節？

將主要規則更改為main:= (crlf @done); , append a crlf來測試 URL，重新生成解析器，這一次解析器可以消費到最后，打印顯示所有 ZE6B391A8D2C4D45902A23A8BD6587 組件都成功解析：

mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark path, p://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark authority, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark userinfo, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark host, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark reg_name, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store reg_name, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store host, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store authority, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark path, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store path, p:?hello=world#preface

mark query, p:hello=world#preface

store query, p:#preface

mark fragment, p:preface

store fragment, p:

parser done, p:

eof:0, done:1, failed:0, state:188, p:0x60000140c04e, pe:0x60000140c04e, diff:0, rest:
scheme: https
authority: chenjianyong.com
host: chenjianyong.com
path: /blog/2022/01/seastar_fpc_1.html
query: hello=world
fragment: preface

那么，為什么我的 ragel 解析器不貪心呢？

Answer 1

為什么它不貪婪地消耗到最后一個字節？

正如您已經注意到的那樣，您done的操作是在冒號之后執行的（里面有fbreak ），我們可以通過渲染您的（相當大的一個！）FSM 來確認它（ ragel -o f.dot -Vp source.c++ && dot -Tpng -o f.png f.dot ）：

它的執行是因為main規范說@done ，根據文檔

將動作嵌入到將機器移動到最終 state 的任何轉換中

如您所見，171 是您機器可能的最終狀態之一。 如果您要將main規范更改為URI %done ，情況會有所不同：

done不再執行，它只會在

go 通過最終 state 離開機器的轉換

我認為這更適合您的情況。

現在你可能想知道為什么 171 是最終狀態之一，那是因為hier_part可以是path_empty ，它是一個zlen ，所以機器可以在這個 state 中完成（對於scheme:輸入）->它是最終狀態之一states -> @done在轉換到此 state 時執行。

ragel解析器不貪心？

問題描述

1 個解決方案

解決方案1
0 2022-08-25 16:07:25

ragel解析器不貪心？

問題描述

1 個解決方案

解決方案1 0 2022-08-25 16:07:25

解決方案1
0 2022-08-25 16:07:25