Jim Tcl
Check-in [fd0eb63ebd]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: utf8: Be more strict at rejecting invalid UTF-8 sequences.

RFC 3629 says:

"Implementations of the decoding algorithm above MUST protect against decoding invalid sequences."

Signed-off-by: Steve Bennett <steveb@workware.net.au>

Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: fd0eb63ebd91397b0caeec45da733e52d51c9278
User & Date: steveb@workware.net.au 2017-12-31 01:47:54
Original User & Date: steveb@workware.net.au 2017-12-31 01:47:55
Context
2017-12-31
01:47
linenoise: Update to fix potential buffer overflow

From https://github.com/msteveb/linenoise/commit/a4545af5e3766c58100be6bf406b9a0d2049090f

Signed-off-by: Steve Bennett <steveb@workware.net.au> check-in: 29cb9ccf5a user: steveb@workware.net.au tags: trunk

01:47
utf8: Be more strict at rejecting invalid UTF-8 sequences.

RFC 3629 says:

"Implementations of the decoding algorithm above MUST protect against decoding invalid sequences."

Signed-off-by: Steve Bennett <steveb@workware.net.au> check-in: fd0eb63ebd user: steveb@workware.net.au tags: trunk

01:45
regexp: Implement class shorthand escapes in brackets

The following class shorthand escapes now match Tcl when used within bracket expressions:

\d [[:digit:]] \s [[:space:]] \w [[:alnum:]_] (note underscore)

e.g. [a-f\d] => [a-f0-9]

Previously these shorthand escapes were only implemented outside bracket expressions.

Signed-off-by: Steve Bennett <steveb@workware.net.au> check-in: aa2e5832cf user: steveb@workware.net.au tags: trunk

Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to tests/lsort.test.

199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
	foreach val [lsort -int -unique {0 5 05 00 004 4}] {
		lappend vallist [expr int($val)]
	}
	set vallist
} {0 4 5}

test lsort-4.26 {DefaultCompare procedure, signed characters} utf8 {
    set l [lsort [list "abc\u80" "abc"]]
    set viewlist {}
    foreach s $l {
	set viewelem ""
	set len [string length $s]
	for {set i 0} {$i < $len} {incr i} {
	    set c [string index $s $i]
	    scan $c %c d
	    if {$d > 0 && $d < 128} {
		append viewelem $c
	    } else {
		append viewelem "\\[format %03o [expr {$d & 0xff}]]"
	    }
	}
	lappend viewlist $viewelem
    }
    set viewlist
} [list "abc" "abc\\200"]

test lsort-5.1 "Sort case insensitive" {
    lsort -nocase {ba aB aa ce}
} {aa aB ba ce}

testreport







|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|






199
200
201
202
203
204
205
206
















207
208
209
210
211
212
213
	foreach val [lsort -int -unique {0 5 05 00 004 4}] {
		lappend vallist [expr int($val)]
	}
	set vallist
} {0 4 5}

test lsort-4.26 {DefaultCompare procedure, signed characters} utf8 {
    lsort [list "abc\u80" "abc"]
















} [list "abc" "abc\u80"]

test lsort-5.1 "Sort case insensitive" {
    lsort -nocase {ba aB aa ce}
} {aa aB ba ce}

testreport

Changes to tests/utftcl.test.

70
71
72
73
74
75
76

77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
} {0}
test utf-4.2 {Tcl_NumUtfChars: length 1} {
    testnumutfchars [bytestring "\xC2\xA2"]
} {1}
test utf-4.3 {Tcl_NumUtfChars: long string} {
    testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"]
} {7}

test utf-4.4 {Tcl_NumUtfChars: #u0000} {
    testnumutfchars [bytestring "\xC0\x80"]
} {1}
test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} {
    testnumutfchars "" 1
} {0}
test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {
    testnumutfchars [bytestring "\xC2\xA2"] 1
} {1}
test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {
    testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1
} {7}
test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {
    testnumutfchars [bytestring "\xC0\x80"] 1
} {1}

test utf-5.1 {Tcl_UtfFindFirsts} {
} {}

test utf-6.1 {Tcl_UtfNext} {







>
|











|







70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
} {0}
test utf-4.2 {Tcl_NumUtfChars: length 1} {
    testnumutfchars [bytestring "\xC2\xA2"]
} {1}
test utf-4.3 {Tcl_NumUtfChars: long string} {
    testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"]
} {7}
# This is an invalid UTF-8 sequence: a non-minimal (overlong) encoding of U+0000, so a strict decoder should count 2 characters, not 1
test utf-4.4 {Tcl_NumUtfChars: #u0000} tcl {
    testnumutfchars [bytestring "\xC0\x80"]
} {1}
test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} {
    testnumutfchars "" 1
} {0}
test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {
    testnumutfchars [bytestring "\xC2\xA2"] 1
} {1}
test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {
    testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1
} {7}
test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} tcl {
    testnumutfchars [bytestring "\xC0\x80"] 1
} {1}

test utf-5.1 {Tcl_UtfFindFirsts} {
} {}

test utf-6.1 {Tcl_UtfNext} {

Changes to utf8.c.

127
128
129
130
131
132
133

134


135
136
137
138
139

140


141
142
143
144
145

146


147
148
149
150
151
152
153
    if (s[0] < 0xc0) {
        *uc = s[0];
        return 1;
    }
    if (s[0] < 0xe0) {
        if ((s[1] & 0xc0) == 0x80) {
            *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80);

            return 2;


        }
    }
    else if (s[0] < 0xf0) {
        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
            *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);

            return 3;


        }
    }
    else if (s[0] < 0xf8) {
        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);

            return 4;


        }
    }

    /* Invalid sequence, so just return the byte */
    *uc = *s;
    return 1;
}







>
|
>
>





>
|
>
>





>
|
>
>







127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
    if (s[0] < 0xc0) {
        *uc = s[0];
        return 1;
    }
    if (s[0] < 0xe0) {
        if ((s[1] & 0xc0) == 0x80) {
            *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80);
            if (*uc >= 0x80) {
                return 2;
            }
            /* Otherwise this is an invalid sequence */
        }
    }
    else if (s[0] < 0xf0) {
        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
            *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
            if (*uc >= 0x800) {
                return 3;
            }
            /* Otherwise this is an invalid sequence */
        }
    }
    else if (s[0] < 0xf8) {
        if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80) && ((str[3] & 0xc0) == 0x80)) {
            *uc = ((s[0] & ~0xf0) << 18) | ((s[1] & ~0x80) << 12) | ((s[2] & ~0x80) << 6) | (s[3] & ~0x80);
            if (*uc >= 0x10000) {
                return 4;
            }
            /* Otherwise this is an invalid sequence */
        }
    }

    /* Invalid sequence, so just return the byte */
    *uc = *s;
    return 1;
}