libstdc++
regex_compiler.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_compiler.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
34 // (http://swtch.com/~rsc/regexp/regexp1.html"),
35 // but doesn't strictly follow it.
36 //
37 // When compiling, states are *chained* instead of tree- or graph-constructed.
38 // It's more like structured programs: there's if statement and loop statement.
39 //
40 // For alternative structure(say "a|b"), aka "if statement", two branchs should
41 // be constructed. However, these two shall merge to an "end_tag" at the end of
42 // this operator:
43 //
44 // branch1
45 // / \
46 // => begin_tag end_tag =>
47 // \ /
48 // branch2
49 //
50 // This is the difference between this implementation and that in Russ's
51 // article.
52 //
53 // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
54 // All dummy node will be eliminated at the end of compiling process.
55 
56 namespace std _GLIBCXX_VISIBILITY(default)
57 {
58 namespace __detail
59 {
60 _GLIBCXX_BEGIN_NAMESPACE_VERSION
61 
62  template<typename _FwdIter, typename _TraitsT>
63  _Compiler<_FwdIter, _TraitsT>::
64  _Compiler(_FwdIter __b, _FwdIter __e,
65  const _TraitsT& __traits, _FlagT __flags)
66  : _M_flags((__flags
67  & (regex_constants::ECMAScript
68  | regex_constants::basic
69  | regex_constants::extended
70  | regex_constants::grep
71  | regex_constants::egrep
72  | regex_constants::awk))
73  ? __flags
74  : __flags | regex_constants::ECMAScript),
75  _M_traits(__traits),
76  _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())),
77  _M_scanner(__b, __e, _M_flags, _M_traits.getloc()),
78  _M_nfa(_M_flags)
79  {
80  _StateSeqT __r(_M_nfa, _M_nfa._M_start());
81  __r._M_append(_M_nfa._M_insert_subexpr_begin());
82  this->_M_disjunction();
83  if (!_M_match_token(_ScannerT::_S_token_eof))
84  __throw_regex_error(regex_constants::error_paren);
85  __r._M_append(_M_pop());
86  _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
87  __r._M_append(_M_nfa._M_insert_subexpr_end());
88  __r._M_append(_M_nfa._M_insert_accept());
89  _M_nfa._M_eliminate_dummy();
90  }
91 
92  template<typename _FwdIter, typename _TraitsT>
93  void
94  _Compiler<_FwdIter, _TraitsT>::
95  _M_disjunction()
96  {
97  this->_M_alternative();
98  while (_M_match_token(_ScannerT::_S_token_or))
99  {
100  _StateSeqT __alt1 = _M_pop();
101  this->_M_alternative();
102  _StateSeqT __alt2 = _M_pop();
103  auto __end = _M_nfa._M_insert_dummy();
104  __alt1._M_append(__end);
105  __alt2._M_append(__end);
106  _M_stack.push(_StateSeqT(_M_nfa,
107  _M_nfa._M_insert_alt(__alt1._M_start,
108  __alt2._M_start, false),
109  __end));
110  }
111  }
112 
113  template<typename _FwdIter, typename _TraitsT>
114  void
115  _Compiler<_FwdIter, _TraitsT>::
116  _M_alternative()
117  {
118  if (this->_M_term())
119  {
120  _StateSeqT __re = _M_pop();
121  this->_M_alternative();
122  __re._M_append(_M_pop());
123  _M_stack.push(__re);
124  }
125  else
126  _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
127  }
128 
129  template<typename _FwdIter, typename _TraitsT>
130  bool
131  _Compiler<_FwdIter, _TraitsT>::
132  _M_term()
133  {
134  if (this->_M_assertion())
135  return true;
136  if (this->_M_atom())
137  {
138  this->_M_quantifier();
139  return true;
140  }
141  return false;
142  }
143 
144  template<typename _FwdIter, typename _TraitsT>
145  bool
146  _Compiler<_FwdIter, _TraitsT>::
147  _M_assertion()
148  {
149  if (_M_match_token(_ScannerT::_S_token_line_begin))
150  _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin()));
151  else if (_M_match_token(_ScannerT::_S_token_line_end))
152  _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end()));
153  else if (_M_match_token(_ScannerT::_S_token_word_bound))
154  // _M_value[0] == 'n' means it's negtive, say "not word boundary".
155  _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
156  _M_insert_word_bound(_M_value[0] == 'n')));
157  else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
158  {
159  auto __neg = _M_value[0] == 'n';
160  this->_M_disjunction();
161  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
162  __throw_regex_error(regex_constants::error_paren);
163  auto __tmp = _M_pop();
164  __tmp._M_append(_M_nfa._M_insert_accept());
165  _M_stack.push(
166  _StateSeqT(
167  _M_nfa,
168  _M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
169  }
170  else
171  return false;
172  return true;
173  }
174 
175  template<typename _FwdIter, typename _TraitsT>
176  void
177  _Compiler<_FwdIter, _TraitsT>::
178  _M_quantifier()
179  {
180  bool __neg = (_M_flags & regex_constants::ECMAScript);
181  auto __init = [this, &__neg]()
182  {
183  if (_M_stack.empty())
184  __throw_regex_error(regex_constants::error_badrepeat);
185  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
186  };
187  if (_M_match_token(_ScannerT::_S_token_closure0))
188  {
189  __init();
190  auto __e = _M_pop();
191  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
192  __e._M_start, __neg));
193  __e._M_append(__r);
194  _M_stack.push(__r);
195  }
196  else if (_M_match_token(_ScannerT::_S_token_closure1))
197  {
198  __init();
199  auto __e = _M_pop();
200  __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
201  __neg));
202  _M_stack.push(__e);
203  }
204  else if (_M_match_token(_ScannerT::_S_token_opt))
205  {
206  __init();
207  auto __e = _M_pop();
208  auto __end = _M_nfa._M_insert_dummy();
209  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
210  __e._M_start, __neg));
211  __e._M_append(__end);
212  __r._M_append(__end);
213  _M_stack.push(__r);
214  }
215  else if (_M_match_token(_ScannerT::_S_token_interval_begin))
216  {
217  if (_M_stack.empty())
218  __throw_regex_error(regex_constants::error_badrepeat);
219  if (!_M_match_token(_ScannerT::_S_token_dup_count))
220  __throw_regex_error(regex_constants::error_badbrace);
221  _StateSeqT __r(_M_pop());
222  _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy());
223  long __min_rep = _M_cur_int_value(10);
224  bool __infi = false;
225  long __n;
226 
227  // {3
228  if (_M_match_token(_ScannerT::_S_token_comma))
229  if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
230  __n = _M_cur_int_value(10) - __min_rep;
231  else
232  __infi = true;
233  else
234  __n = 0;
235  if (!_M_match_token(_ScannerT::_S_token_interval_end))
236  __throw_regex_error(regex_constants::error_brace);
237 
238  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
239 
240  for (long __i = 0; __i < __min_rep; ++__i)
241  __e._M_append(__r._M_clone());
242 
243  if (__infi)
244  {
245  auto __tmp = __r._M_clone();
246  _StateSeqT __s(_M_nfa,
247  _M_nfa._M_insert_alt(_S_invalid_state_id,
248  __tmp._M_start, __neg));
249  __tmp._M_append(__s);
250  __e._M_append(__s);
251  }
252  else
253  {
254  if (__n < 0)
255  __throw_regex_error(regex_constants::error_badbrace);
256  auto __end = _M_nfa._M_insert_dummy();
257  // _M_alt is the "match more" branch, and _M_next is the
258  // "match less" one. Switch _M_alt and _M_next of all created
259  // nodes. This is a hacking but IMO works well.
260  std::stack<_StateIdT> __stack;
261  for (long __i = 0; __i < __n; ++__i)
262  {
263  auto __tmp = __r._M_clone();
264  auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
265  __end, __neg);
266  __stack.push(__alt);
267  __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
268  }
269  __e._M_append(__end);
270  while (!__stack.empty())
271  {
272  auto& __tmp = _M_nfa[__stack.top()];
273  __stack.pop();
274  swap(__tmp._M_next, __tmp._M_alt);
275  }
276  }
277  _M_stack.push(__e);
278  }
279  }
280 
281  template<typename _FwdIter, typename _TraitsT>
282  bool
283  _Compiler<_FwdIter, _TraitsT>::
284  _M_atom()
285  {
286  if (_M_match_token(_ScannerT::_S_token_anychar))
287  _M_stack.push(_StateSeqT(_M_nfa,
288  _M_nfa._M_insert_matcher
289  (_AnyMatcher<_TraitsT>(_M_traits))));
290  else if (_M_try_char())
291  _M_stack.push(_StateSeqT(_M_nfa,
292  _M_nfa._M_insert_matcher
293  (_CharMatcher<_TraitsT>(_M_value[0],
294  _M_traits,
295  _M_flags))));
296  else if (_M_match_token(_ScannerT::_S_token_backref))
297  _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
298  _M_insert_backref(_M_cur_int_value(10))));
299  else if (_M_match_token(_ScannerT::_S_token_quoted_class))
300  {
301  _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
302  _BMatcherT __matcher(_M_ctype.is(_CtypeT::upper, _M_value[0]),
303  _M_traits, _M_flags);
304  __matcher._M_add_character_class(_M_value);
305  _M_stack.push(_StateSeqT(_M_nfa,
306  _M_nfa._M_insert_matcher(std::move(__matcher))));
307  }
308  else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
309  {
310  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy());
311  this->_M_disjunction();
312  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
313  __throw_regex_error(regex_constants::error_paren);
314  __r._M_append(_M_pop());
315  _M_stack.push(__r);
316  }
317  else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
318  {
319  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin());
320  this->_M_disjunction();
321  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
322  __throw_regex_error(regex_constants::error_paren);
323  __r._M_append(_M_pop());
324  __r._M_append(_M_nfa._M_insert_subexpr_end());
325  _M_stack.push(__r);
326  }
327  else if (!_M_bracket_expression())
328  return false;
329  return true;
330  }
331 
332  template<typename _FwdIter, typename _TraitsT>
333  bool
334  _Compiler<_FwdIter, _TraitsT>::
335  _M_bracket_expression()
336  {
337  bool __neg =
338  _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
339  if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
340  return false;
341  _BMatcherT __matcher(__neg, _M_traits, _M_flags);
342  while (!_M_match_token(_ScannerT::_S_token_bracket_end))
343  _M_expression_term(__matcher);
344  _M_stack.push(_StateSeqT(_M_nfa,
345  _M_nfa._M_insert_matcher(std::move(__matcher))));
346  return true;
347  }
348 
349  template<typename _FwdIter, typename _TraitsT>
350  void
351  _Compiler<_FwdIter, _TraitsT>::
352  _M_expression_term(_BMatcherT& __matcher)
353  {
354  if (_M_match_token(_ScannerT::_S_token_collsymbol))
355  __matcher._M_add_collating_element(_M_value);
356  else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
357  __matcher._M_add_equivalence_class(_M_value);
358  else if (_M_match_token(_ScannerT::_S_token_char_class_name))
359  __matcher._M_add_character_class(_M_value);
360  else if (_M_try_char()) // [a
361  {
362  auto __ch = _M_value[0];
363  if (_M_try_char())
364  {
365  if (_M_value[0] == '-') // [a-
366  {
367  if (_M_try_char()) // [a-z]
368  {
369  __matcher._M_make_range(__ch, _M_value[0]);
370  return;
371  }
372  // If the dash is the last character in the bracket
373  // expression, it is not special.
374  if (_M_scanner._M_get_token()
375  != _ScannerT::_S_token_bracket_end)
376  __throw_regex_error(regex_constants::error_range);
377  }
378  __matcher._M_add_char(_M_value[0]);
379  }
380  __matcher._M_add_char(__ch);
381  }
382  else
383  __throw_regex_error(regex_constants::error_brack);
384  }
385 
386  template<typename _FwdIter, typename _TraitsT>
387  bool
388  _Compiler<_FwdIter, _TraitsT>::
389  _M_try_char()
390  {
391  bool __is_char = false;
392  if (_M_match_token(_ScannerT::_S_token_oct_num))
393  {
394  __is_char = true;
395  _M_value.assign(1, _M_cur_int_value(8));
396  }
397  else if (_M_match_token(_ScannerT::_S_token_hex_num))
398  {
399  __is_char = true;
400  _M_value.assign(1, _M_cur_int_value(16));
401  }
402  else if (_M_match_token(_ScannerT::_S_token_ord_char))
403  __is_char = true;
404  return __is_char;
405  }
406 
407  template<typename _FwdIter, typename _TraitsT>
408  bool
409  _Compiler<_FwdIter, _TraitsT>::
410  _M_match_token(_TokenT token)
411  {
412  if (token == _M_scanner._M_get_token())
413  {
414  _M_value = _M_scanner._M_get_value();
415  _M_scanner._M_advance();
416  return true;
417  }
418  return false;
419  }
420 
421  template<typename _FwdIter, typename _TraitsT>
422  int
423  _Compiler<_FwdIter, _TraitsT>::
424  _M_cur_int_value(int __radix)
425  {
426  long __v = 0;
427  for (typename _StringT::size_type __i = 0;
428  __i < _M_value.length(); ++__i)
429  __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
430  return __v;
431  }
432 
433  template<typename _TraitsT>
434  bool
435  _BracketMatcher<_TraitsT>::operator()(_CharT __ch) const
436  {
437  bool __ret = false;
438  if (_M_traits.isctype(__ch, _M_class_set)
439  || _M_char_set.count(_M_translate(__ch))
440  || _M_equiv_set.count(_M_traits.transform_primary(&__ch, &__ch+1)))
441  __ret = true;
442  else
443  {
444  _StringT __s = _M_get_str(_M_flags & regex_constants::collate
445  ? _M_translate(__ch) : __ch);
446  for (auto& __it : _M_range_set)
447  if (__it.first <= __s && __s <= __it.second)
448  {
449  __ret = true;
450  break;
451  }
452  }
453  if (_M_is_non_matching)
454  return !__ret;
455  else
456  return __ret;
457  }
458 
459 _GLIBCXX_END_NAMESPACE_VERSION
460 } // namespace __detail
461 } // namespace
const _Facet & use_facet(const locale &__loc)
Return a facet.use_facet looks for and returns a reference to a facet of type Facet where Facet is th...
reference top()
Definition: stl_stack.h:159
void pop()
Removes first element.
Definition: stl_stack.h:212
constexpr error_type error_badrepeat(_S_error_badrepeat)
constexpr error_type error_range(_S_error_range)
constexpr error_type error_brack(_S_error_brack)
constexpr error_type error_badbrace(_S_error_badbrace)
void push(const value_type &__x)
Add data to the top of the stack.
Definition: stl_stack.h:186
void swap(function< _Res(_Args...)> &__x, function< _Res(_Args...)> &__y)
Swap the targets of two polymorphic function object wrappers.
Definition: functional:2534
constexpr error_type error_paren(_S_error_paren)
A standard container giving FILO behavior.
Definition: stl_stack.h:96
constexpr error_type error_brace(_S_error_brace)