blob: 2b43175b1d982af5f8d5ffcf9042dc3464ab66a7 [file] [log] [blame]
Tom Rinif739fcd2018-05-07 17:02:21 -04001// SPDX-License-Identifier: GPL-2.0+
Rob Clark78178bb2017-09-09 06:47:40 -04002/*
3 * charset conversion utils
4 *
5 * Copyright (c) 2017 Rob Clark
Rob Clark78178bb2017-09-09 06:47:40 -04006 */
7
Rob Clark78178bb2017-09-09 06:47:40 -04008#include <charset.h>
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +02009#include <capitalization.h>
Heinrich Schuchardt70616a12021-02-27 14:08:35 +010010#include <cp437.h>
Ilias Apalodimas6974a4a2020-11-22 15:10:26 +020011#include <efi_loader.h>
Heinrich Schuchardt73bb90c2021-02-27 14:08:36 +010012#include <errno.h>
Rob Clark78178bb2017-09-09 06:47:40 -040013#include <malloc.h>
14
/**
 * codepage_437 - Unicode to codepage 437 translation table
 *
 * The 128 entries are provided by the CP437 macro. utf_to_cp() searches a
 * table of this layout for a Unicode value and maps a hit at index j to the
 * code page character j + 0x80.
 */
const u16 codepage_437[128] = CP437;
19
/*
 * Table of upper/lower case pairs used by utf_to_lower() and utf_to_upper()
 * for code points above 0x7f. The contents are selected at build time:
 * the Unicode table if CONFIG_EFI_UNICODE_CAPITALIZATION is defined, else
 * the code page 1250 table if CONFIG_FAT_DEFAULT_CODEPAGE is 1250, else the
 * code page 437 table.
 *
 * The lookup loops stop at an entry whose upper (resp. lower) member is 0,
 * so the macros are expected to end with such an entry (they are defined
 * outside this file - TODO confirm).
 */
static struct capitalization_table capitalization_table[] =
#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
	UNICODE_CAPITALIZATION_TABLE;
#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
	CP1250_CAPITALIZATION_TABLE;
#else
	CP437_CAPITALIZATION_TABLE;
#endif
28
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020029/**
30 * get_code() - read Unicode code point from UTF-8 stream
31 *
32 * @read_u8: - stream reader
33 * @src: - string buffer passed to stream reader, optional
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +010034 * Return: - Unicode code point, or -1
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020035 */
36static int get_code(u8 (*read_u8)(void *data), void *data)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020037{
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020038 s32 ch = 0;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020039
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020040 ch = read_u8(data);
41 if (!ch)
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020042 return 0;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020043 if (ch >= 0xc2 && ch <= 0xf4) {
44 int code = 0;
45
46 if (ch >= 0xe0) {
47 if (ch >= 0xf0) {
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020048 /* 0xf0 - 0xf4 */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020049 ch &= 0x07;
50 code = ch << 18;
51 ch = read_u8(data);
52 if (ch < 0x80 || ch > 0xbf)
53 goto error;
54 ch &= 0x3f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020055 } else {
56 /* 0xe0 - 0xef */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020057 ch &= 0x0f;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020058 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020059 code += ch << 12;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020060 if ((code >= 0xD800 && code <= 0xDFFF) ||
61 code >= 0x110000)
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020062 goto error;
63 ch = read_u8(data);
64 if (ch < 0x80 || ch > 0xbf)
65 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020066 }
67 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020068 ch &= 0x3f;
69 code += ch << 6;
70 ch = read_u8(data);
71 if (ch < 0x80 || ch > 0xbf)
72 goto error;
73 ch &= 0x3f;
74 ch += code;
75 } else if (ch >= 0x80) {
76 goto error;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +020077 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020078 return ch;
79error:
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +010080 return -1;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +020081}
82
83/**
84 * read_string() - read byte from character string
85 *
86 * @data: - pointer to string
87 * Return: - byte read
88 *
89 * The string pointer is incremented if it does not point to '\0'.
90 */
91static u8 read_string(void *data)
92
93{
94 const char **src = (const char **)data;
95 u8 c;
96
97 if (!src || !*src || !**src)
98 return 0;
99 c = **src;
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200100 ++*src;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200101 return c;
102}
103
104/**
105 * read_console() - read byte from console
106 *
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200107 * @data - not used, needed to match interface
108 * Return: - byte read or 0 on error
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200109 */
110static u8 read_console(void *data)
111{
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200112 int ch;
113
Heinrich Schuchardtc670aee2020-10-07 18:11:48 +0200114 ch = getchar();
Heinrich Schuchardt60d79872018-10-02 06:43:38 +0200115 if (ch < 0)
116 ch = 0;
117 return ch;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200118}
119
120int console_read_unicode(s32 *code)
121{
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +0100122 for (;;) {
123 s32 c;
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200124
Heinrich Schuchardtddbaff52021-02-27 14:08:37 +0100125 if (!tstc()) {
126 /* No input available */
127 return 1;
128 }
129
130 /* Read Unicode code */
131 c = get_code(read_console, NULL);
132 if (c > 0) {
133 *code = c;
134 return 0;
135 }
136 }
Heinrich Schuchardt35cbb792018-09-12 00:05:32 +0200137}
138
139s32 utf8_get(const char **src)
140{
141 return get_code(read_string, src);
Heinrich Schuchardtd8c28232018-08-31 21:31:27 +0200142}
143
144int utf8_put(s32 code, char **dst)
145{
146 if (!dst || !*dst)
147 return -1;
148 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
149 return -1;
150 if (code <= 0x007F) {
151 **dst = code;
152 } else {
153 if (code <= 0x07FF) {
154 **dst = code >> 6 | 0xC0;
155 } else {
156 if (code < 0x10000) {
157 **dst = code >> 12 | 0xE0;
158 } else {
159 **dst = code >> 18 | 0xF0;
160 ++*dst;
161 **dst = (code >> 12 & 0x3F) | 0x80;
162 }
163 ++*dst;
164 **dst = (code >> 6 & 0x3F) | 0x80;
165 }
166 ++*dst;
167 **dst = (code & 0x3F) | 0x80;
168 }
169 ++*dst;
170 return 0;
171}
172
173size_t utf8_utf16_strnlen(const char *src, size_t count)
174{
175 size_t len = 0;
176
177 for (; *src && count; --count) {
178 s32 code = utf8_get(&src);
179
180 if (!code)
181 break;
182 if (code < 0) {
183 /* Reserve space for a replacement character */
184 len += 1;
185 } else if (code < 0x10000) {
186 len += 1;
187 } else {
188 len += 2;
189 }
190 }
191 return len;
192}
193
194int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
195{
196 if (!src || !dst || !*dst)
197 return -1;
198
199 for (; count && *src; --count) {
200 s32 code = utf8_get(&src);
201
202 if (code < 0)
203 code = '?';
204 utf16_put(code, dst);
205 }
206 **dst = 0;
207 return 0;
208}
209
210s32 utf16_get(const u16 **src)
211{
212 s32 code, code2;
213
214 if (!src || !*src)
215 return -1;
216 if (!**src)
217 return 0;
218 code = **src;
219 ++*src;
220 if (code >= 0xDC00 && code <= 0xDFFF)
221 return -1;
222 if (code >= 0xD800 && code <= 0xDBFF) {
223 if (!**src)
224 return -1;
225 code &= 0x3ff;
226 code <<= 10;
227 code += 0x10000;
228 code2 = **src;
229 ++*src;
230 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
231 return -1;
232 code2 &= 0x3ff;
233 code += code2;
234 }
235 return code;
236}
237
238int utf16_put(s32 code, u16 **dst)
239{
240 if (!dst || !*dst)
241 return -1;
242 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
243 return -1;
244 if (code < 0x10000) {
245 **dst = code;
246 } else {
247 code -= 0x10000;
248 **dst = code >> 10 | 0xD800;
249 ++*dst;
250 **dst = (code & 0x3ff) | 0xDC00;
251 }
252 ++*dst;
253 return 0;
254}
255
256size_t utf16_strnlen(const u16 *src, size_t count)
257{
258 size_t len = 0;
259
260 for (; *src && count; --count) {
261 s32 code = utf16_get(&src);
262
263 if (!code)
264 break;
265 /*
266 * In case of an illegal sequence still reserve space for a
267 * replacement character.
268 */
269 ++len;
270 }
271 return len;
272}
273
274size_t utf16_utf8_strnlen(const u16 *src, size_t count)
275{
276 size_t len = 0;
277
278 for (; *src && count; --count) {
279 s32 code = utf16_get(&src);
280
281 if (!code)
282 break;
283 if (code < 0)
284 /* Reserve space for a replacement character */
285 len += 1;
286 else if (code < 0x80)
287 len += 1;
288 else if (code < 0x800)
289 len += 2;
290 else if (code < 0x10000)
291 len += 3;
292 else
293 len += 4;
294 }
295 return len;
296}
297
298int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
299{
300 if (!src || !dst || !*dst)
301 return -1;
302
303 for (; count && *src; --count) {
304 s32 code = utf16_get(&src);
305
306 if (code < 0)
307 code = '?';
308 utf8_put(code, dst);
309 }
310 **dst = 0;
311 return 0;
312}
313
Heinrich Schuchardtb5130a82018-09-04 19:34:56 +0200314s32 utf_to_lower(const s32 code)
315{
316 struct capitalization_table *pos = capitalization_table;
317 s32 ret = code;
318
319 if (code <= 0x7f) {
320 if (code >= 'A' && code <= 'Z')
321 ret += 0x20;
322 return ret;
323 }
324 for (; pos->upper; ++pos) {
325 if (pos->upper == code) {
326 ret = pos->lower;
327 break;
328 }
329 }
330 return ret;
331}
332
333s32 utf_to_upper(const s32 code)
334{
335 struct capitalization_table *pos = capitalization_table;
336 s32 ret = code;
337
338 if (code <= 0x7f) {
339 if (code >= 'a' && code <= 'z')
340 ret -= 0x20;
341 return ret;
342 }
343 for (; pos->lower; ++pos) {
344 if (pos->lower == code) {
345 ret = pos->upper;
346 break;
347 }
348 }
349 return ret;
350}
Rob Clark78178bb2017-09-09 06:47:40 -0400351
AKASHI Takahirof8062c92019-09-18 10:26:29 +0900352/*
Heinrich Schuchardt7a9b3662022-12-29 14:44:03 +0100353 * u16_strcasecmp() - compare two u16 strings case insensitively
354 *
355 * @s1: first string to compare
356 * @s2: second string to compare
357 * @n: maximum number of u16 to compare
358 * Return: 0 if the first n u16 are the same in s1 and s2
359 * < 0 if the first different u16 in s1 is less than the
360 * corresponding u16 in s2
361 * > 0 if the first different u16 in s1 is greater than the
362 */
363int u16_strcasecmp(const u16 *s1, const u16 *s2)
364{
365 int ret = 0;
366 s32 c1, c2;
367
368 for (;;) {
369 c1 = utf_to_upper(utf16_get(&s1));
370 c2 = utf_to_upper(utf16_get(&s2));
371 ret = c1 - c2;
372 if (ret || !c1 || c1 == -1 || c2 == -1)
373 break;
374 }
375 return ret;
376}
377
378/*
AKASHI Takahirof8062c92019-09-18 10:26:29 +0900379 * u16_strncmp() - compare two u16 string
380 *
381 * @s1: first string to compare
382 * @s2: second string to compare
383 * @n: maximum number of u16 to compare
384 * Return: 0 if the first n u16 are the same in s1 and s2
385 * < 0 if the first different u16 in s1 is less than the
386 * corresponding u16 in s2
387 * > 0 if the first different u16 in s1 is greater than the
388 * corresponding u16 in s2
389 */
390int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
391{
392 int ret = 0;
393
394 for (; n; --n, ++s1, ++s2) {
395 ret = *s1 - *s2;
396 if (ret || !*s1)
397 break;
398 }
399
400 return ret;
401}
402
Ilias Apalodimas6974a4a2020-11-22 15:10:26 +0200403size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
Rob Clark78178bb2017-09-09 06:47:40 -0400404{
405 size_t i;
406 for (i = 0; count-- && in[i]; i++);
407 return i;
408}
409
Sughosh Ganu4835d352020-05-06 22:12:41 +0300410size_t u16_strsize(const void *in)
411{
412 return (u16_strlen(in) + 1) * sizeof(u16);
413}
414
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900415u16 *u16_strcpy(u16 *dest, const u16 *src)
416{
417 u16 *tmp = dest;
418
419 for (;; dest++, src++) {
420 *dest = *src;
421 if (!*src)
422 break;
423 }
424
425 return tmp;
426}
427
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200428u16 *u16_strdup(const void *src)
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900429{
430 u16 *new;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200431 size_t len;
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900432
433 if (!src)
434 return NULL;
Heinrich Schuchardt967407d2022-04-02 11:46:59 +0200435 len = u16_strsize(src);
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200436 new = malloc(len);
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900437 if (!new)
438 return NULL;
Heinrich Schuchardt317068b2019-07-14 17:28:49 +0200439 memcpy(new, src, len);
Akashi, Takahiro2a3537a2018-12-14 19:10:38 +0900440
441 return new;
442}
443
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900444size_t u16_strlcat(u16 *dest, const u16 *src, size_t count)
445{
Matthias Schiffer7c00b802023-07-14 13:24:51 +0200446 size_t destlen = u16_strnlen(dest, count);
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900447 size_t srclen = u16_strlen(src);
Matthias Schiffer7c00b802023-07-14 13:24:51 +0200448 size_t ret = destlen + srclen;
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900449
450 if (destlen >= count)
451 return ret;
Matthias Schiffer7c00b802023-07-14 13:24:51 +0200452 if (ret >= count)
453 srclen -= (ret - count + 1);
Masahisa Kojimaeca08ce2022-04-28 17:09:34 +0900454 memcpy(&dest[destlen], src, 2 * srclen);
455 dest[destlen + srclen] = 0x0000;
456
457 return ret;
458}
459
/**
 * utf16_to_utf8() - convert UTF-16 to UTF-8
 *
 * @dest:	output buffer; at most 3 bytes are written per UTF-16 code
 *		unit of input, the size of the buffer is not checked
 * @src:	UTF-16 input
 * @size:	number of UTF-16 code units to convert; zero code units are
 *		converted like any other value and do not end the conversion
 * Return:	pointer to the byte following the last byte written
 *
 * A low surrogate without preceding high surrogate and a high surrogate
 * that is not followed by a low surrogate (including one at the very end of
 * the input) are each replaced by '?'. No terminating zero is written.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Unpaired high surrogate */
				*dest++ = '?';
				/*
				 * The current unit may be valid. Push it back,
				 * restoring the count as well so that the last
				 * unit of the input is not lost.
				 */
				src--;
				size++;
			}

			code_high = 0;
		} else {
			if (code <= 0x007F) {
				*dest++ = code;
			} else if (code <= 0x07FF) {
				*dest++ = (code >> 6) | 0xC0;
				*dest++ = (code & 0x3F) | 0x80;
			} else if (code >= 0xD800 && code <= 0xDBFF) {
				/* Wait for the matching low surrogate */
				code_high = code;
				continue;
			} else if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Low surrogate without high surrogate */
				*dest++ = '?';
			} else {
				/* Remaining 16 bit values need three bytes */
				*dest++ = (code >> 12) | 0xE0;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			}
		}
	}

	/* Input ended in the middle of a surrogate pair */
	if (code_high)
		*dest++ = '?';

	return dest;
}
Heinrich Schuchardt73bb90c2021-02-27 14:08:36 +0100512
Heinrich Schuchardt73bb90c2021-02-27 14:08:36 +0100513int utf_to_cp(s32 *c, const u16 *codepage)
514{
515 if (*c >= 0x80) {
516 int j;
517
518 /* Look up codepage translation */
519 for (j = 0; j < 0x80; ++j) {
520 if (*c == codepage[j]) {
521 *c = j + 0x80;
522 return 0;
523 }
524 }
525 *c = '?';
526 return -ENOENT;
527 }
528 return 0;
529}
Heinrich Schuchardte91789e2021-02-27 14:08:38 +0100530
531int utf8_to_cp437_stream(u8 c, char *buffer)
532{
533 char *end;
534 const char *pos;
535 s32 s;
536 int ret;
537
538 for (;;) {
539 pos = buffer;
540 end = buffer + strlen(buffer);
541 *end++ = c;
542 *end = 0;
543 s = utf8_get(&pos);
544 if (s > 0) {
545 *buffer = 0;
546 ret = utf_to_cp(&s, codepage_437);
547 return s;
548 }
549 if (pos == end)
550 return 0;
551 *buffer = 0;
552 }
553}
554
555int utf8_to_utf32_stream(u8 c, char *buffer)
556{
557 char *end;
558 const char *pos;
559 s32 s;
560
561 for (;;) {
562 pos = buffer;
563 end = buffer + strlen(buffer);
564 *end++ = c;
565 *end = 0;
566 s = utf8_get(&pos);
567 if (s > 0) {
568 *buffer = 0;
569 return s;
570 }
571 if (pos == end)
572 return 0;
Heinrich Schuchardt3150da32024-01-18 18:54:50 +0100573 /*
574 * Appending the byte lead to an invalid UTF-8 byte sequence.
575 * Consider it as the start of a new code sequence.
576 */
Heinrich Schuchardte91789e2021-02-27 14:08:38 +0100577 *buffer = 0;
578 }
579}