libstdc++
regex_scanner.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 // back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 namespace __detail
52 {
53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
54 
55  template<typename _FwdIter>
56  _Scanner<_FwdIter>::
57  _Scanner(_FwdIter __begin, _FwdIter __end,
58  _FlagT __flags, std::locale __loc)
59  : _M_state(_S_state_normal), _M_current(__begin), _M_end(__end),
60  _M_flags(__flags),
61  _M_ctype(std::use_facet<_CtypeT>(__loc)),
62  _M_at_bracket_start(false),
63  _M_token_map
64  {
65  {'^', _S_token_line_begin},
66  {'$', _S_token_line_end},
67  {'.', _S_token_anychar},
68  {'*', _S_token_closure0},
69  {'+', _S_token_closure1},
70  {'?', _S_token_opt},
71  {'|', _S_token_or},
72  // grep and egrep
73  {'\n', _S_token_or},
74  },
75  _M_ecma_escape_map
76  {
77  {'0', '\0'},
78  {'b', '\b'},
79  {'f', '\f'},
80  {'n', '\n'},
81  {'r', '\r'},
82  {'t', '\t'},
83  {'v', '\v'},
84  },
85  _M_awk_escape_map
86  {
87  {'"', '"'},
88  {'/', '/'},
89  {'\\', '\\'},
90  {'a', '\a'},
91  {'b', '\b'},
92  {'f', '\f'},
93  {'n', '\n'},
94  {'r', '\r'},
95  {'t', '\t'},
96  {'v', '\v'},
97  },
98  _M_ecma_spec_char
99  {
100  '^',
101  '$',
102  '\\',
103  '.',
104  '*',
105  '+',
106  '?',
107  '(',
108  ')',
109  '[',
110  ']',
111  '{',
112  '}',
113  '|',
114  },
115  _M_basic_spec_char
116  {
117  '.',
118  '[',
119  '\\',
120  '*',
121  '^',
122  '$',
123  },
124  _M_extended_spec_char
125  {
126  '.',
127  '[',
128  '\\',
129  '(',
130  ')',
131  '*',
132  '+',
133  '?',
134  '{',
135  '|',
136  '^',
137  '$',
138  },
139  _M_escape_map(_M_is_ecma()
140  ? _M_ecma_escape_map
141  : _M_awk_escape_map),
142  _M_spec_char(_M_is_ecma()
143  ? _M_ecma_spec_char
144  : _M_is_basic()
145  ? _M_basic_spec_char
146  : _M_extended_spec_char),
147  _M_eat_escape(_M_is_ecma()
148  ? &_Scanner::_M_eat_escape_ecma
149  : &_Scanner::_M_eat_escape_posix)
150  { _M_advance(); }
151 
152  template<typename _FwdIter>
153  void
154  _Scanner<_FwdIter>::
155  _M_advance()
156  {
157  if (_M_current == _M_end)
158  {
159  _M_token = _S_token_eof;
160  return;
161  }
162 
163  if (_M_state == _S_state_normal)
164  _M_scan_normal();
165  else if (_M_state == _S_state_in_bracket)
166  _M_scan_in_bracket();
167  else if (_M_state == _S_state_in_brace)
168  _M_scan_in_brace();
169  else
170  _GLIBCXX_DEBUG_ASSERT(false);
171  }
172 
173  // Differences between styles:
174  // 1) "\(", "\)", "\{" in basic. It's not escaping.
175  // 2) "(?:", "(?=", "(?!" in ECMAScript.
176  template<typename _FwdIter>
177  void
178  _Scanner<_FwdIter>::
179  _M_scan_normal()
180  {
181  auto __c = *_M_current++;
182 
183  if (__c == '\\')
184  {
185  if (_M_current == _M_end)
186  __throw_regex_error(regex_constants::error_escape);
187 
188  if (!_M_is_basic()
189  || (*_M_current != '('
190  && *_M_current != ')'
191  && *_M_current != '{'))
192  {
193  (this->*_M_eat_escape)();
194  return;
195  }
196  __c = *_M_current++;
197  }
198  if (__c == '(')
199  {
200  if (_M_is_ecma() && *_M_current == '?')
201  {
202  if (++_M_current == _M_end)
203  __throw_regex_error(regex_constants::error_paren);
204 
205  if (*_M_current == ':')
206  {
207  ++_M_current;
208  _M_token = _S_token_subexpr_no_group_begin;
209  }
210  else if (*_M_current == '=')
211  {
212  ++_M_current;
213  _M_token = _S_token_subexpr_lookahead_begin;
214  _M_value.assign(1, 'p');
215  }
216  else if (*_M_current == '!')
217  {
218  ++_M_current;
219  _M_token = _S_token_subexpr_lookahead_begin;
220  _M_value.assign(1, 'n');
221  }
222  else
223  __throw_regex_error(regex_constants::error_paren);
224  }
225  else
226  _M_token = _S_token_subexpr_begin;
227  }
228  else if (__c == ')')
229  _M_token = _S_token_subexpr_end;
230  else if (__c == '[')
231  {
232  _M_state = _S_state_in_bracket;
233  _M_at_bracket_start = true;
234  if (_M_current != _M_end && *_M_current == '^')
235  {
236  _M_token = _S_token_bracket_neg_begin;
237  ++_M_current;
238  }
239  else
240  _M_token = _S_token_bracket_begin;
241  }
242  else if (__c == '{')
243  {
244  _M_state = _S_state_in_brace;
245  _M_token = _S_token_interval_begin;
246  }
247  else if ((_M_spec_char.count(_M_ctype.narrow(__c, '\0'))
248  && __c != ']'
249  && __c != '}')
250  || (_M_is_grep() && __c == '\n'))
251  _M_token = _M_token_map.at(__c);
252  else
253  {
254  _M_token = _S_token_ord_char;
255  _M_value.assign(1, __c);
256  }
257  }
258 
259  // Differences between styles:
260  // 1) different semantics of "[]" and "[^]".
261  // 2) Escaping in bracket expr.
262  template<typename _FwdIter>
263  void
264  _Scanner<_FwdIter>::
265  _M_scan_in_bracket()
266  {
267  if (_M_current == _M_end)
268  __throw_regex_error(regex_constants::error_brack);
269 
270  auto __c = *_M_current++;
271 
272  if (__c == '[')
273  {
274  if (_M_current == _M_end)
275  __throw_regex_error(regex_constants::error_brack);
276 
277  if (*_M_current == '.')
278  {
279  _M_token = _S_token_collsymbol;
280  _M_eat_class(*_M_current++);
281  }
282  else if (*_M_current == ':')
283  {
284  _M_token = _S_token_char_class_name;
285  _M_eat_class(*_M_current++);
286  }
287  else if (*_M_current == '=')
288  {
289  _M_token = _S_token_equiv_class_name;
290  _M_eat_class(*_M_current++);
291  }
292  else
293  {
294  _M_token = _S_token_ord_char;
295  _M_value.assign(1, __c);
296  }
297  }
298  // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
299  // literally. So "[]]" or "[^]]" is valid regex. See the testcases
300  // `*/empty_range.cc`.
301  else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
302  {
303  _M_token = _S_token_bracket_end;
304  _M_state = _S_state_normal;
305  }
306  // ECMAScirpt and awk permmits escaping in bracket.
307  else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
308  (this->*_M_eat_escape)();
309  else
310  {
311  _M_token = _S_token_ord_char;
312  _M_value.assign(1, __c);
313  }
314  _M_at_bracket_start = false;
315  }
316 
317  // Differences between styles:
318  // 1) "\}" in basic style.
319  template<typename _FwdIter>
320  void
321  _Scanner<_FwdIter>::
322  _M_scan_in_brace()
323  {
324  if (_M_current == _M_end)
325  __throw_regex_error(regex_constants::error_brace);
326 
327  auto __c = *_M_current++;
328 
329  if (_M_ctype.is(_CtypeT::digit, __c))
330  {
331  _M_token = _S_token_dup_count;
332  _M_value.assign(1, __c);
333  while (_M_current != _M_end
334  && _M_ctype.is(_CtypeT::digit, *_M_current))
335  _M_value += *_M_current++;
336  }
337  else if (__c == ',')
338  _M_token = _S_token_comma;
339  // basic use \}.
340  else if (_M_is_basic())
341  {
342  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
343  {
344  _M_state = _S_state_normal;
345  _M_token = _S_token_interval_end;
346  ++_M_current;
347  }
348  else
349  __throw_regex_error(regex_constants::error_badbrace);
350  }
351  else if (__c == '}')
352  {
353  _M_state = _S_state_normal;
354  _M_token = _S_token_interval_end;
355  }
356  else
357  __throw_regex_error(regex_constants::error_badbrace);
358  }
359 
360  template<typename _FwdIter>
361  void
362  _Scanner<_FwdIter>::
363  _M_eat_escape_ecma()
364  {
365  if (_M_current == _M_end)
366  __throw_regex_error(regex_constants::error_escape);
367 
368  auto __c = *_M_current++;
369 
370  if (_M_escape_map.count(_M_ctype.narrow(__c, '\0'))
371  && (__c != 'b' || _M_state == _S_state_in_bracket))
372  {
373  _M_token = _S_token_ord_char;
374  _M_value.assign(1, _M_escape_map.at(__c));
375  }
376  else if (__c == 'b')
377  {
378  _M_token = _S_token_word_bound;
379  _M_value.assign(1, 'p');
380  }
381  else if (__c == 'B')
382  {
383  _M_token = _S_token_word_bound;
384  _M_value.assign(1, 'n');
385  }
386  // N3376 28.13
387  else if (__c == 'd'
388  || __c == 'D'
389  || __c == 's'
390  || __c == 'S'
391  || __c == 'w'
392  || __c == 'W')
393  {
394  _M_token = _S_token_quoted_class;
395  _M_value.assign(1, __c);
396  }
397  else if (__c == 'c')
398  {
399  if (_M_current == _M_end)
400  __throw_regex_error(regex_constants::error_escape);
401  _M_token = _S_token_ord_char;
402  _M_value.assign(1, *_M_current++);
403  }
404  else if (__c == 'x' || __c == 'u')
405  {
406  _M_value.erase();
407  for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
408  {
409  if (_M_current == _M_end
410  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
411  __throw_regex_error(regex_constants::error_escape);
412  _M_value += *_M_current++;
413  }
414  _M_token = _S_token_hex_num;
415  }
416  // ECMAScript recongnizes multi-digit back-references.
417  else if (_M_ctype.is(_CtypeT::digit, __c))
418  {
419  _M_value.assign(1, __c);
420  while (_M_current != _M_end
421  && _M_ctype.is(_CtypeT::digit, *_M_current))
422  _M_value += *_M_current++;
423  _M_token = _S_token_backref;
424  }
425  else
426  {
427  _M_token = _S_token_ord_char;
428  _M_value.assign(1, __c);
429  }
430  }
431 
432  // Differences between styles:
433  // 1) Extended doesn't support backref, but basic does.
434  template<typename _FwdIter>
435  void
436  _Scanner<_FwdIter>::
437  _M_eat_escape_posix()
438  {
439  if (_M_current == _M_end)
440  __throw_regex_error(regex_constants::error_escape);
441 
442  auto __c = *_M_current;
443 
444  if (_M_spec_char.count(_M_ctype.narrow(__c, '\0')))
445  {
446  _M_token = _S_token_ord_char;
447  _M_value.assign(1, __c);
448  }
449  // We MUST judge awk before handling backrefs. There's no backref in awk.
450  else if (_M_is_awk())
451  {
452  _M_eat_escape_awk();
453  return;
454  }
455  else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
456  {
457  _M_token = _S_token_backref;
458  _M_value.assign(1, __c);
459  }
460  else
461  {
462 #ifdef __STRICT_ANSI__
463  __throw_regex_error(regex_constants::error_escape);
464 #else
465  _M_token = _S_token_ord_char;
466  _M_value.assign(1, __c);
467 #endif
468  }
469  ++_M_current;
470  }
471 
472  template<typename _FwdIter>
473  void
474  _Scanner<_FwdIter>::
475  _M_eat_escape_awk()
476  {
477  auto __c = *_M_current++;
478 
479  if (_M_escape_map.count(_M_ctype.narrow(__c, '\0')))
480  {
481  _M_token = _S_token_ord_char;
482  _M_value.assign(1, _M_escape_map.at(__c));
483  }
484  // \ddd for oct representation
485  else if (_M_ctype.is(_CtypeT::digit, __c)
486  && __c != '8'
487  && __c != '9')
488  {
489  _M_value.assign(1, __c);
490  for (int __i = 0;
491  __i < 2
492  && _M_current != _M_end
493  && _M_ctype.is(_CtypeT::digit, *_M_current)
494  && *_M_current != '8'
495  && *_M_current != '9';
496  __i++)
497  _M_value += *_M_current++;
498  _M_token = _S_token_oct_num;
499  return;
500  }
501  else
502  __throw_regex_error(regex_constants::error_escape);
503  }
504 
505  // Eats a character class or throwns an exception.
506  // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
507  // returning.
508  template<typename _FwdIter>
509  void
510  _Scanner<_FwdIter>::
511  _M_eat_class(char __ch)
512  {
513  for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
514  _M_value += *_M_current++;
515  if (_M_current == _M_end
516  || *_M_current++ != __ch
517  || _M_current == _M_end // skip __ch
518  || *_M_current++ != ']') // skip ']'
519  {
520  if (__ch == ':')
521  __throw_regex_error(regex_constants::error_ctype);
522  else
523  __throw_regex_error(regex_constants::error_collate);
524  }
525  }
526 
#ifdef _GLIBCXX_DEBUG
// Debug helper: writes a one-line description of the current token
// (_M_token), including _M_value for the tokens that carry one, to ostr
// and returns ostr.
template<typename _FwdIter>
  std::ostream&
  _Scanner<_FwdIter>::
  _M_print(std::ostream& ostr)
  {
    // __head is written first.  When __tail is non-null the token carries
    // a value, written as: __head, _M_value, __tail.
    const char* __head = nullptr;
    const char* __tail = nullptr;

    switch (_M_token)
      {
      // Tokens described by a fixed line.
      case _S_token_anychar:
        __head = "any-character\n"; break;
      case _S_token_backref:
        __head = "backref\n"; break;
      case _S_token_bracket_begin:
        __head = "bracket-begin\n"; break;
      case _S_token_bracket_neg_begin:
        __head = "bracket-neg-begin\n"; break;
      case _S_token_bracket_end:
        __head = "bracket-end\n"; break;
      case _S_token_closure0:
        __head = "closure0\n"; break;
      case _S_token_closure1:
        __head = "closure1\n"; break;
      case _S_token_comma:
        __head = "comma\n"; break;
      case _S_token_eof:
        __head = "EOF\n"; break;
      case _S_token_interval_begin:
        __head = "interval begin\n"; break;
      case _S_token_interval_end:
        __head = "interval end\n"; break;
      case _S_token_line_begin:
        __head = "line begin\n"; break;
      case _S_token_line_end:
        __head = "line end\n"; break;
      case _S_token_opt:
        __head = "opt\n"; break;
      case _S_token_or:
        __head = "or\n"; break;
      case _S_token_subexpr_begin:
        __head = "subexpr begin\n"; break;
      case _S_token_subexpr_no_group_begin:
        __head = "no grouping subexpr begin\n"; break;
      case _S_token_subexpr_lookahead_begin:
        __head = "lookahead subexpr begin\n"; break;
      case _S_token_subexpr_end:
        __head = "subexpr end\n"; break;
      case _S_token_unknown:
        __head = "-- unknown token --\n"; break;

      // Tokens whose value is printed inside double quotes.
      case _S_token_char_class_name:
        __head = "char-class-name \""; __tail = "\"\n"; break;
      case _S_token_collsymbol:
        __head = "collsymbol \""; __tail = "\"\n"; break;
      case _S_token_equiv_class_name:
        __head = "equiv-class-name \""; __tail = "\"\n"; break;
      case _S_token_ord_char:
        __head = "ordinary character: \""; __tail = "\"\n"; break;

      // Tokens whose value is printed bare.
      case _S_token_dup_count:
        __head = "dup count: "; __tail = "\n"; break;
      case _S_token_oct_num:
        __head = "oct number "; __tail = "\n"; break;
      case _S_token_hex_num:
        __head = "hex number "; __tail = "\n"; break;

      // Written out in full to keep the same sequence of insertions.
      case _S_token_quoted_class:
        ostr << "quoted class " << "\\" << _M_value << "\n";
        return ostr;

      default:
        _GLIBCXX_DEBUG_ASSERT(false);
        return ostr;
      }

    ostr << __head;
    if (__tail)
      ostr << _M_value << __tail;
    return ostr;
  }
#endif
625 
626 _GLIBCXX_END_NAMESPACE_VERSION
627 } // namespace __detail
628 } // namespace
constexpr error_type error_ctype(_S_error_ctype)
const _Facet & use_facet(const locale &__loc)
Return a facet.use_facet looks for and returns a reference to a facet of type Facet where Facet is th...
Container class for localization functionality.The locale class is first a class wrapper for C librar...
constexpr error_type error_brack(_S_error_brack)
constexpr error_type error_badbrace(_S_error_badbrace)
constexpr error_type error_collate(_S_error_collate)
constexpr error_type error_paren(_S_error_paren)
constexpr error_type error_escape(_S_error_escape)
constexpr error_type error_brace(_S_error_brace)