blob: 5686d6fb59ce81c7b1670810929b281932ca851a [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0+
/*
 * charset conversion utils
 *
 * Copyright (c) 2017 Rob Clark
 */
7
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +02008#include <common.h>
Rob Clark78178bb2017-09-09 06:47:40 -04009#include <charset.h>
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020010#include <capitalization.h>
Rob Clark78178bb2017-09-09 06:47:40 -040011#include <malloc.h>
12
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +020013static struct capitalization_table capitalization_table[] =
14#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
15 UNICODE_CAPITALIZATION_TABLE;
16#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
17 CP1250_CAPITALIZATION_TABLE;
18#else
19 CP437_CAPITALIZATION_TABLE;
20#endif
21
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020022/**
23 * get_code() - read Unicode code point from UTF-8 stream
24 *
25 * @read_u8: - stream reader
26 * @src: - string buffer passed to stream reader, optional
27 * Return: - Unicode code point
28 */
29static int get_code(u8 (*read_u8)(void *data), void *data)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020030{
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020031 s32 ch = 0;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020032
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020033 ch = read_u8(data);
34 if (!ch)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020035 return 0;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020036 if (ch >= 0xc2 && ch <= 0xf4) {
37 int code = 0;
38
39 if (ch >= 0xe0) {
40 if (ch >= 0xf0) {
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020041 /* 0xf0 - 0xf4 */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020042 ch &= 0x07;
43 code = ch << 18;
44 ch = read_u8(data);
45 if (ch < 0x80 || ch > 0xbf)
46 goto error;
47 ch &= 0x3f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020048 } else {
49 /* 0xe0 - 0xef */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020050 ch &= 0x0f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020051 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020052 code += ch << 12;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020053 if ((code >= 0xD800 && code <= 0xDFFF) ||
54 code >= 0x110000)
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020055 goto error;
56 ch = read_u8(data);
57 if (ch < 0x80 || ch > 0xbf)
58 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020059 }
60 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020061 ch &= 0x3f;
62 code += ch << 6;
63 ch = read_u8(data);
64 if (ch < 0x80 || ch > 0xbf)
65 goto error;
66 ch &= 0x3f;
67 ch += code;
68 } else if (ch >= 0x80) {
69 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020070 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020071 return ch;
72error:
73 return '?';
74}
75
76/**
77 * read_string() - read byte from character string
78 *
79 * @data: - pointer to string
80 * Return: - byte read
81 *
82 * The string pointer is incremented if it does not point to '\0'.
83 */
84static u8 read_string(void *data)
85
86{
87 const char **src = (const char **)data;
88 u8 c;
89
90 if (!src || !*src || !**src)
91 return 0;
92 c = **src;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020093 ++*src;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020094 return c;
95}
96
97/**
98 * read_console() - read byte from console
99 *
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200100 * @data - not used, needed to match interface
101 * Return: - byte read or 0 on error
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200102 */
103static u8 read_console(void *data)
104{
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200105 int ch;
106
Heinrich Schuchardtc670aee2020-10-07 18:11:48 +0200107 ch = getchar();
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200108 if (ch < 0)
109 ch = 0;
110 return ch;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200111}
112
113int console_read_unicode(s32 *code)
114{
115 if (!tstc()) {
116 /* No input available */
117 return 1;
118 }
119
120 /* Read Unicode code */
121 *code = get_code(read_console, NULL);
122 return 0;
123}
124
125s32 utf8_get(const char **src)
126{
127 return get_code(read_string, src);
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200128}
129
130int utf8_put(s32 code, char **dst)
131{
132 if (!dst || !*dst)
133 return -1;
134 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
135 return -1;
136 if (code <= 0x007F) {
137 **dst = code;
138 } else {
139 if (code <= 0x07FF) {
140 **dst = code >> 6 | 0xC0;
141 } else {
142 if (code < 0x10000) {
143 **dst = code >> 12 | 0xE0;
144 } else {
145 **dst = code >> 18 | 0xF0;
146 ++*dst;
147 **dst = (code >> 12 & 0x3F) | 0x80;
148 }
149 ++*dst;
150 **dst = (code >> 6 & 0x3F) | 0x80;
151 }
152 ++*dst;
153 **dst = (code & 0x3F) | 0x80;
154 }
155 ++*dst;
156 return 0;
157}
158
159size_t utf8_utf16_strnlen(const char *src, size_t count)
160{
161 size_t len = 0;
162
163 for (; *src && count; --count) {
164 s32 code = utf8_get(&src);
165
166 if (!code)
167 break;
168 if (code < 0) {
169 /* Reserve space for a replacement character */
170 len += 1;
171 } else if (code < 0x10000) {
172 len += 1;
173 } else {
174 len += 2;
175 }
176 }
177 return len;
178}
179
180int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
181{
182 if (!src || !dst || !*dst)
183 return -1;
184
185 for (; count && *src; --count) {
186 s32 code = utf8_get(&src);
187
188 if (code < 0)
189 code = '?';
190 utf16_put(code, dst);
191 }
192 **dst = 0;
193 return 0;
194}
195
196s32 utf16_get(const u16 **src)
197{
198 s32 code, code2;
199
200 if (!src || !*src)
201 return -1;
202 if (!**src)
203 return 0;
204 code = **src;
205 ++*src;
206 if (code >= 0xDC00 && code <= 0xDFFF)
207 return -1;
208 if (code >= 0xD800 && code <= 0xDBFF) {
209 if (!**src)
210 return -1;
211 code &= 0x3ff;
212 code <<= 10;
213 code += 0x10000;
214 code2 = **src;
215 ++*src;
216 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
217 return -1;
218 code2 &= 0x3ff;
219 code += code2;
220 }
221 return code;
222}
223
224int utf16_put(s32 code, u16 **dst)
225{
226 if (!dst || !*dst)
227 return -1;
228 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
229 return -1;
230 if (code < 0x10000) {
231 **dst = code;
232 } else {
233 code -= 0x10000;
234 **dst = code >> 10 | 0xD800;
235 ++*dst;
236 **dst = (code & 0x3ff) | 0xDC00;
237 }
238 ++*dst;
239 return 0;
240}
241
242size_t utf16_strnlen(const u16 *src, size_t count)
243{
244 size_t len = 0;
245
246 for (; *src && count; --count) {
247 s32 code = utf16_get(&src);
248
249 if (!code)
250 break;
251 /*
252 * In case of an illegal sequence still reserve space for a
253 * replacement character.
254 */
255 ++len;
256 }
257 return len;
258}
259
260size_t utf16_utf8_strnlen(const u16 *src, size_t count)
261{
262 size_t len = 0;
263
264 for (; *src && count; --count) {
265 s32 code = utf16_get(&src);
266
267 if (!code)
268 break;
269 if (code < 0)
270 /* Reserve space for a replacement character */
271 len += 1;
272 else if (code < 0x80)
273 len += 1;
274 else if (code < 0x800)
275 len += 2;
276 else if (code < 0x10000)
277 len += 3;
278 else
279 len += 4;
280 }
281 return len;
282}
283
284int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
285{
286 if (!src || !dst || !*dst)
287 return -1;
288
289 for (; count && *src; --count) {
290 s32 code = utf16_get(&src);
291
292 if (code < 0)
293 code = '?';
294 utf8_put(code, dst);
295 }
296 **dst = 0;
297 return 0;
298}
299
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +0200300s32 utf_to_lower(const s32 code)
301{
302 struct capitalization_table *pos = capitalization_table;
303 s32 ret = code;
304
305 if (code <= 0x7f) {
306 if (code >= 'A' && code <= 'Z')
307 ret += 0x20;
308 return ret;
309 }
310 for (; pos->upper; ++pos) {
311 if (pos->upper == code) {
312 ret = pos->lower;
313 break;
314 }
315 }
316 return ret;
317}
318
319s32 utf_to_upper(const s32 code)
320{
321 struct capitalization_table *pos = capitalization_table;
322 s32 ret = code;
323
324 if (code <= 0x7f) {
325 if (code >= 'a' && code <= 'z')
326 ret -= 0x20;
327 return ret;
328 }
329 for (; pos->lower; ++pos) {
330 if (pos->lower == code) {
331 ret = pos->upper;
332 break;
333 }
334 }
335 return ret;
336}
Rob Clark78178bb2017-09-09 06:47:40 -0400337
AKASHI Takahirof8062c92019-09-18 10:26:29 +0900338/*
339 * u16_strncmp() - compare two u16 string
340 *
341 * @s1: first string to compare
342 * @s2: second string to compare
343 * @n: maximum number of u16 to compare
344 * Return: 0 if the first n u16 are the same in s1 and s2
345 * < 0 if the first different u16 in s1 is less than the
346 * corresponding u16 in s2
347 * > 0 if the first different u16 in s1 is greater than the
348 * corresponding u16 in s2
349 */
350int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
351{
352 int ret = 0;
353
354 for (; n; --n, ++s1, ++s2) {
355 ret = *s1 - *s2;
356 if (ret || !*s1)
357 break;
358 }
359
360 return ret;
361}
362
/**
 * u16_strlen() - count 16 bit units in a zero terminated u16 string
 *
 * @in:		string to measure
 * Return:	number of 16 bit units before the terminating unit, which
 *		consists of two zero bytes
 *
 * The string is scanned as pairs of bytes, so no 16 bit load is ever
 * performed on @in.
 */
size_t u16_strlen(const void *in)
{
	const char *start = in;
	const char *pos = start;

	/* A unit terminates the string only if both of its bytes are zero */
	while (pos[0] || pos[1])
		pos += 2;

	return (size_t)(pos - start) >> 1;
}
374
Heinrich Schuchardt1dde0d52018-08-31 21:31:26 +0200375size_t u16_strnlen(const u16 *in, size_t count)
Rob Clark78178bb2017-09-09 06:47:40 -0400376{
377 size_t i;
378 for (i = 0; count-- && in[i]; i++);
379 return i;
380}
381
Sughosh Ganu4835d352020-05-06 22:12:41 +0300382size_t u16_strsize(const void *in)
383{
384 return (u16_strlen(in) + 1) * sizeof(u16);
385}
386
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900387u16 *u16_strcpy(u16 *dest, const u16 *src)
388{
389 u16 *tmp = dest;
390
391 for (;; dest++, src++) {
392 *dest = *src;
393 if (!*src)
394 break;
395 }
396
397 return tmp;
398}
399
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200400u16 *u16_strdup(const void *src)
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900401{
402 u16 *new;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200403 size_t len;
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900404
405 if (!src)
406 return NULL;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200407 len = (u16_strlen(src) + 1) * sizeof(u16);
408 new = malloc(len);
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900409 if (!new)
410 return NULL;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200411 memcpy(new, src, len);
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900412
413 return new;
414}
415
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * @dest:	output buffer
 * @src:	UTF-16 input, does not need to be zero terminated
 * @size:	number of 16 bit units to convert
 * Return:	pointer to the byte following the last byte written
 *
 * Exactly @size units are read from @src; a 0 unit is converted like any
 * other character. Every unpaired surrogate, including a high surrogate at
 * the very end of the input, is written as '?'. No terminating '\0' is
 * appended. At most three bytes are written per input unit; the caller must
 * provide a buffer of that size.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	/* Pending high surrogate, 0 if none */
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			uint32_t high = code_high;

			code_high = 0;
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
				continue;
			}
			/*
			 * Unpaired high surrogate. The current unit may be
			 * valid, so it is converted below instead of being
			 * dropped.
			 */
			*dest++ = '?';
		}

		if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Wait for the matching low surrogate */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Low surrogate without high surrogate */
			*dest++ = '?';
		} else {
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	/* Input ended right after a high surrogate */
	if (code_high)
		*dest++ = '?';

	return dest;
}