Differences From Artifact [3fb976e2e4f03938]:
- File
utf8.h
-
2011-11-28 03:13:40
- part of checkin
[fa6ea5bdaa]
on branch trunk
- Add support for [string totitle]
Signed-off-by: Steve Bennett <steveb@workware.net.au> (user: steveb@workware.net.au)
-
2011-11-28 03:13:40
- part of checkin
[fa6ea5bdaa]
on branch trunk
- Add support for [string totitle]
To Artifact [13a1a88b291e1b31]:
- File
utf8.h
-
2011-12-02 10:56:49
- part of checkin
[16094543f9]
on branch trunk
- Extend UTF-8 support past the BMP
Now codepoints up to U+1FFFFF are supported, including as literals with the new \u{NNNNNN} syntax (up to six hex digits)
Signed-off-by: Steve Bennett <steveb@workware.net.au> (user: steveb@workware.net.au)
-
2011-12-02 10:56:49
- part of checkin
[16094543f9]
on branch trunk
- Extend UTF-8 support past the BMP
5 5 *
6 6 * (c) 2010 Steve Bennett <steveb@workware.net.au>
7 7 *
8 8 * See LICENCE for licence details.
9 9 */
10 10 #include <jim-config.h>
11 11
12 +/* Currently we support unicode points up to 2^21-1 */
13 +#define MAX_UTF8_LEN 4
14 +
12 15 /**
13 - * Converts the given unicode codepoint (0 - 0xffff) to utf-8
16 + * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
14 17 * and stores the result at 'p'.
15 - *
16 - * Returns the number of utf-8 characters (1-3).
18 + *
19 + * Returns the number of utf-8 characters (up to MAX_UTF8_LEN).
17 20 */
18 -int utf8_fromunicode(char *p, unsigned short uc);
21 +int utf8_fromunicode(char *p, unsigned uc);
19 22
20 23 #ifndef JIM_UTF8
21 24 #include <ctype.h>
22 25
23 26 /* No utf-8 support. 1 byte = 1 char */
24 27 #define utf8_strlen(S, B) ((B) < 0 ? strlen(S) : (B))
25 28 #define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
................................................................................
46 49 * string of the given byte length.
47 50 *
48 51 * Any bytes which are not part of a valid utf-8
49 52 * sequence are treated as individual characters.
50 53 *
51 54 * The string *must* be null terminated.
52 55 *
53 - * Does not support unicode code points > \uffff
56 + * Does not support unicode code points > \u{1fffff}
54 57 */
55 58 int utf8_strlen(const char *str, int bytelen);
56 59
57 60 /**
58 61 * Returns the byte index of the given character in the utf-8 string.
59 62 *
60 63 * The string *must* be null terminated.
................................................................................
72 75 * consumed.
73 76 *
74 77 * If 'str' is null terminated, then an invalid utf-8 sequence
75 78 * at the end of the string will be returned as individual bytes.
76 79 *
77 80 * If it is not null terminated, the length *must* be checked first.
78 81 *
79 - * Does not support unicode code points > \uffff
82 + * Does not support unicode code points > \u{1fffff}
80 83 */
81 84 int utf8_tounicode(const char *str, int *uc);
82 85
83 86 /**
84 87 * Returns the number of bytes before 'str' that the previous
85 88 * utf-8 character sequence starts (which may be the middle of a sequence).
86 89 *
................................................................................
88 91 * If no start char is found, returns -len
89 92 */
90 93 int utf8_prev_len(const char *str, int len);
91 94
92 95 /**
93 96 * Returns the upper-case variant of the given unicode codepoint.
94 97 *
95 - * Does not support unicode code points > \uffff
98 + * Unicode code points > \uffff are returned unchanged.
96 99 */
97 100 int utf8_upper(int uc);
98 101
99 102 /**
100 103 * Returns the title-case variant of the given unicode codepoint.
101 104 *
102 105 * If none, returns utf8_upper().
................................................................................
106 109 int utf8_title(int uc);
107 110
108 111 /**
109 112 * Returns the lower-case variant of the given unicode codepoint.
110 113 *
111 114 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
112 115 *
113 - * Does not support unicode code points > \uffff
116 + * Unicode code points > \uffff are returned unchanged.
114 117 */
115 118 int utf8_lower(int uc);
116 119 #endif /* JIM_BOOTSTRAP */
117 120
118 121 #endif
119 122
120 123 #endif