Jim Tcl
Check-in [aa2e5832cf]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: regexp: Implement class shorthand escapes in brackets

The following class shorthand escapes now match Tcl when used within bracket expressions:

\d => [[:digit:]], \s => [[:space:]], \w => [[:alnum:]_] (note underscore)

e.g. [a-f\d] => [a-f0-9]

Previously these shorthand escapes were only implemented outside bracket expressions.

Signed-off-by: Steve Bennett <steveb@workware.net.au>

Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: aa2e5832cf425d8797018ff2b06501aa37729829
User & Date: steveb@workware.net.au 2017-12-31 01:45:53
Context
2017-12-31
01:47
utf8: Be more strict at rejecting invalid UTF-8 sequences.

RFC 3629 says:

Implementations of the decoding algorithm above MUST protect against decoding invalid sequences.

Signed-off-by: Steve Bennett <steveb@workware.net.au> check-in: fd0eb63ebd user: steveb@workware.net.au tags: trunk

01:45
regexp: Implement class shorthand escapes in brackets

The following class shorthand escapes now match Tcl when used within bracket expressions:

\d => [[:digit:]], \s => [[:space:]], \w => [[:alnum:]_] (note underscore)

e.g. [a-f\d] => [a-f0-9]

Previously these shorthand escapes were only implemented outside bracket expressions.

Signed-off-by: Steve Bennett <steveb@workware.net.au> check-in: aa2e5832cf user: steveb@workware.net.au tags: trunk

2017-12-02
09:51
jim.c: Various minor cleanups

Small improvements to code for clarity and code reduction. Improve comments.

Signed-off-by: Steve Bennett <steveb@workware.net.au> check-in: b8e69cbcb0 user: steveb@workware.net.au tags: trunk

Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to jimregexp.c.

720
721
722
723
724
725
726







727
728
















729
730
731
732
733
734
735
...
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762

763
764
765
766
767
768
769
770

771
772
773
774
775
776
777
778
			}

			while (*pattern && *pattern != ']') {
				/* Is this a range? a-z */
				int start;
				int end;








				pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
				if (start == '\\') {
















					pattern += reg_decode_escape(pattern, &start);
					if (start == 0) {
						preg->err = REG_ERR_NULL_CHAR;
						return 0;
					}
				}
				if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
................................................................................
					continue;
				}
				if (start == '[' && pattern[0] == ':') {
					static const char *character_class[] = {
						":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
						":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
					};
					enum {
						CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
						CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
						CC_NUM
					};
					int i;

					for (i = 0; i < CC_NUM; i++) {

						n = strlen(character_class[i]);
						if (strncmp(pattern, character_class[i], n) == 0) {
							/* Found a character class */
							pattern += n + 1;
							break;
						}
					}
					if (i != CC_NUM) {

						switch (i) {
							case CC_ALNUM:
								reg_addrange(preg, '0', '9');
								/* Fall through */
							case CC_ALPHA:
								if ((preg->cflags & REG_ICASE) == 0) {
									reg_addrange(preg, 'a', 'z');
								}







>
>
>
>
>
>
>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
<
<
<
<
<

<
>
|
|





|
>
|







720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
...
771
772
773
774
775
776
777






778

779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
			}

			while (*pattern && *pattern != ']') {
				/* Is this a range? a-z */
				int start;
				int end;

				enum {
					CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
					CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
					CC_NUM
				};
				int cc;

				pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
				if (start == '\\') {
					/* First check for class shorthand escapes */
					switch (*pattern) {
						case 's':
							pattern++;
							cc = CC_SPACE;
							goto cc_switch;
						case 'd':
							pattern++;
							cc = CC_DIGIT;
							goto cc_switch;
						case 'w':
							pattern++;
							reg_addrange(preg, '_', '_');
							cc = CC_ALNUM;
							goto cc_switch;
					}
					pattern += reg_decode_escape(pattern, &start);
					if (start == 0) {
						preg->err = REG_ERR_NULL_CHAR;
						return 0;
					}
				}
				if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
................................................................................
					continue;
				}
				if (start == '[' && pattern[0] == ':') {
					static const char *character_class[] = {
						":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
						":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
					};








					for (cc = 0; cc < CC_NUM; cc++) {
						n = strlen(character_class[cc]);
						if (strncmp(pattern, character_class[cc], n) == 0) {
							/* Found a character class */
							pattern += n + 1;
							break;
						}
					}
					if (cc != CC_NUM) {
cc_switch:
						switch (cc) {
							case CC_ALNUM:
								reg_addrange(preg, '0', '9');
								/* Fall through */
							case CC_ALPHA:
								if ((preg->cflags & REG_ICASE) == 0) {
									reg_addrange(preg, 'a', 'z');
								}

Changes to tests/regcount.test.

80
81
82
83
84
85
86

87
88
89
90
91
92
93
	(a|y){2,5}? baaaad {aa a}
	(a|y){1,3}? baaaad {a a}
	(a|y){1,2}? baaaad {a a}
	(a|y){3,4}? baaaad {aaa a}
	(a|y){5,6}? baaaad {}
    {[[:alpha:]]+} _bcd56_ef bcd
    {[[:alnum:]]+} _bcd56_ef bcd56

    {[[:space:]]+} "_bc \t\r\n\f\v_" "{ \t\r\n\f\v}"
    {[\x41-\x43]+} "_ABCD_" ABC
    {\m.+\M} "#A test#" "{A test}"
    {\m.+?\M} "#A test#" "A"
    {\m\M} "a" ""
    {ab*c} xnbbmbbbc {}
    {.^xxx} yyy {}







>







80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
	(a|y){2,5}? baaaad {aa a}
	(a|y){1,3}? baaaad {a a}
	(a|y){1,2}? baaaad {a a}
	(a|y){3,4}? baaaad {aaa a}
	(a|y){5,6}? baaaad {}
    {[[:alpha:]]+} _bcd56_ef bcd
    {[[:alnum:]]+} _bcd56_ef bcd56
    {[\w]+} :_bcd56_ef _bcd56_ef
    {[[:space:]]+} "_bc \t\r\n\f\v_" "{ \t\r\n\f\v}"
    {[\x41-\x43]+} "_ABCD_" ABC
    {\m.+\M} "#A test#" "{A test}"
    {\m.+?\M} "#A test#" "A"
    {\m\M} "a" ""
    {ab*c} xnbbmbbbc {}
    {.^xxx} yyy {}