Tokenizer 作りなおした - 理系学生日記

Shell クラスからトークンを切り出す機能を独立させることにしました．Tokenizer クラスに持たせた機能は，セミコロンあるいはパイプで区切られる "文" をトークンに切り出すこと．仕様としては，リダイレクト記号 ('<'，'>') やアンパサンドの前後にはスペースを空けましょうねということです．スペースなしを許してしまうと，いろいろメンドい．

#ifndef TOKENIZER_H
#define TOKENIZER_H

#include <string>
#include <vector>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>

// Splits one shell "sentence" into word tokens and records redirection
// targets and the background flag seen along the way.
//
// Tokens are separated by spaces or tabs; a backslash escapes the next
// character and both double and single quotes group text into one token
// (see the separator construction in the constructor).
class Tokenizer {
  typedef boost::escaped_list_separator<char> separator_t;
  typedef boost::tokenizer< separator_t >     tokenizer_t;
  typedef boost::shared_ptr< std::string >    ptr_str_t;
  typedef std::vector< ptr_str_t >            ptr_str_vec;

private:
  std::string str;          // empty string the tokenizer is initially bound to
  separator_t separator;    // escape / delimiter / quote configuration
  tokenizer_t tokenizer;    // re-assigned to each sentence in tokenize()
  ptr_str_vec tokens;       // word tokens collected by tokenize()
  std::string input_file;   // target recorded for "<"
  std::string output_file;  // target recorded for ">"

  bool is_background;       // set when a "&" token was seen
  bool is_iredirect;        // set when a "<" redirection was recorded
  bool is_oredirect;        // set when a ">" redirection was recorded

public:
  typedef ptr_str_vec::iterator               iterator;
  typedef ptr_str_vec::const_iterator         const_iterator;

  // Members are initialized in declaration order, so `str` and `separator`
  // already exist when `tokenizer` is constructed from them.
  Tokenizer() : separator( "\\", " \t", "\"'" ), tokenizer( str, separator ),
                is_background( false ), is_iredirect( false ), is_oredirect( false )
  {}

  virtual ~Tokenizer() {}
  void sentence_separate();
  void tokenize( const std::string& );
  void output() const;
  void reset();

  const std::string& get_input_file()  const { return input_file;  }
  const std::string& get_output_file() const { return output_file; }

  // Iteration over the collected tokens. The original accessors had no
  // return statement, which is undefined behaviour for a non-void function.
  iterator       begin()       { return tokens.begin(); }
  iterator       end()         { return tokens.end();   }
  const_iterator begin() const { return tokens.begin(); }
  const_iterator end()   const { return tokens.end();   }

  bool is_bg() const { return is_background; }
  bool is_ir() const { return is_iredirect;  }
  bool is_or() const { return is_oredirect;  }
};

#endif

実際にトークンに切り出す部分は Tokenizer::tokenize なのだけど，記号の前後にはスペースを空けましょうという決まりを作ったおかげで，ちょっと楽になりました．切り出したトークンは，shared_ptr 用の vector クラスに放り込みます．

void Tokenizer::tokenize( const std::string& sentence ) {
  tokenizer.assign( sentence );
  tokens.clear();

  for ( tokenizer_t::iterator it = tokenizer.begin(); it != tokenizer.end(); ++it ) {
    std::string *pstr = new std::string( *it );

    if ( pstr->compare( "&" ) == 0 ) { 
      is_background = true; continue; 
    }
    else if ( pstr->compare( ">" ) == 0 ) {
      while( (++it)->empty() ); 
      output_file = std::string( *it );
      is_oredirect = true;
      continue;
    }
    else if ( pstr->compare( "<" ) == 0 ) {
      while( (++it)->empty() ); 
      input_file = std::string( *it );
      is_iredirect = true;
      continue;
    }
    
    if ( ! pstr->empty() ) {
      ptr_str_t p( pstr );
      tokens.push_back( p );
    }
  }
}

main ループとしてはこんな感じ．今回は単にトークンに区切れていれば OK です．

// Interactive driver: prompts for a line, tokenizes it, prints the result
// and resets the tokenizer. Stops on the literal line "end" or when
// standard input is exhausted or fails.
int main() {
  std::string line;
  Tokenizer tok;

  while ( true ) {
    std::cout << "input: " << std::flush;

    // Without this check, EOF would leave `line` unchanged and the loop
    // would spin forever re-processing the last input.
    if ( !std::getline( std::cin, line ) )
      break;
    if ( line == "end" )
      break;

    tok.tokenize( line );
    tok.output();
    tok.reset();
  }

  return 0;
}

$ ./a.out
input: echo "hi \"hello world\""
echo
hi "hello world"
[ bg=0 ]
input: cat   < a.txt   > y.txt
cat
[ bg=0 if=a.txt  of=y.txt  ]
input: end