Jim Tcl
Diff
Not logged in

Differences From Artifact [3fb976e2e4f03938]:

To Artifact [13a1a88b291e1b31]:


5 5 * 6 6 * (c) 2010 Steve Bennett <steveb@workware.net.au> 7 7 * 8 8 * See LICENCE for licence details. 9 9 */ 10 10 #include <jim-config.h> 11 11 12 +/* Currently we support unicode code points up to 2^21-1 (0x1fffff) */ 13 +#define MAX_UTF8_LEN 4 14 + 12 15 /** 13 - * Converts the given unicode codepoint (0 - 0xffff) to utf-8 16 + * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8 14 17 * and stores the result at 'p'. 15 - * 16 - * Returns the number of utf-8 characters (1-3). 18 + * 19 + * Returns the number of utf-8 characters (up to MAX_UTF8_LEN). 17 20 */ 18 -int utf8_fromunicode(char *p, unsigned short uc); 21 +int utf8_fromunicode(char *p, unsigned uc); 19 22 20 23 #ifndef JIM_UTF8 21 24 #include <ctype.h> 22 25 23 26 /* No utf-8 support. 1 byte = 1 char */ 24 27 #define utf8_strlen(S, B) ((B) < 0 ? strlen(S) : (B)) 25 28 #define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1) ................................................................................ 46 49 * string of the given byte length. 47 50 * 48 51 * Any bytes which are not part of a valid utf-8 49 52 * sequence are treated as individual characters. 50 53 * 51 54 * The string *must* be null terminated. 52 55 * 53 - * Does not support unicode code points > \uffff 56 + * Does not support unicode code points > \u1fffff 54 57 */ 55 58 int utf8_strlen(const char *str, int bytelen); 56 59 57 60 /** 58 61 * Returns the byte index of the given character in the utf-8 string. 59 62 * 60 63 * The string *must* be null terminated. ................................................................................ 72 75 * consumed. 73 76 * 74 77 * If 'str' is null terminated, then an invalid utf-8 sequence 75 78 * at the end of the string will be returned as individual bytes. 76 79 * 77 80 * If it is not null terminated, the length *must* be checked first. 
78 81 * 79 - * Does not support unicode code points > \uffff 82 + * Does not support unicode code points > \u1fffff 80 83 */ 81 84 int utf8_tounicode(const char *str, int *uc); 82 85 83 86 /** 84 87 * Returns the number of bytes before 'str' that the previous 85 88 * utf-8 character sequence starts (which may be the middle of a sequence). 86 89 * ................................................................................ 88 91 * If no start char is found, returns -len 89 92 */ 90 93 int utf8_prev_len(const char *str, int len); 91 94 92 95 /** 93 96 * Returns the upper-case variant of the given unicode codepoint. 94 97 * 95 - * Does not support unicode code points > \uffff 98 + * Unicode code points > \uffff are returned unchanged. 96 99 */ 97 100 int utf8_upper(int uc); 98 101 99 102 /** 100 103 * Returns the title-case variant of the given unicode codepoint. 101 104 * 102 105 * If none, returns utf8_upper(). ................................................................................ 106 109 int utf8_title(int uc); 107 110 108 111 /** 109 112 * Returns the lower-case variant of the given unicode codepoint. 110 113 * 111 114 * NOTE: Use utf8_upper() in preference for case-insensitive matching. 112 115 * 113 - * Does not support unicode code points > \uffff 116 + * Unicode code points > \uffff are returned unchanged. 114 117 */ 115 118 int utf8_lower(int uc); 116 119 #endif /* JIM_BOOTSTRAP */ 117 120 118 121 #endif 119 122 120 123 #endif