blob: e1f7d3bd1d6e4d30aabaaaa3f4beb0cd0c5671cf [file] [log] [blame]
Li Jun67774832021-01-25 21:43:47 +08001#ifndef _LINUX_UTF_H
2#define _LINUX_UTF_H
3
4#include <asm/unaligned.h>
5
6static inline int utf8_to_utf16le(const char *s, __le16 *cp, unsigned len)
7{
8 int count = 0;
9 u8 c;
10 u16 uchar;
11
12 /*
13 * this insists on correct encodings, though not minimal ones.
14 * BUT it currently rejects legit 4-byte UTF-8 code points,
15 * which need surrogate pairs. (Unicode 3.1 can use them.)
16 */
17 while (len != 0 && (c = (u8) *s++) != 0) {
18 if ((c & 0x80)) {
19 /*
20 * 2-byte sequence:
21 * 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
22 */
23 if ((c & 0xe0) == 0xc0) {
24 uchar = (c & 0x1f) << 6;
25
26 c = (u8) *s++;
27 if ((c & 0xc0) != 0x80)
28 goto fail;
29 c &= 0x3f;
30 uchar |= c;
31
32 /*
33 * 3-byte sequence (most CJKV characters):
34 * zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
35 */
36 } else if ((c & 0xf0) == 0xe0) {
37 uchar = (c & 0x0f) << 12;
38
39 c = (u8) *s++;
40 if ((c & 0xc0) != 0x80)
41 goto fail;
42 c &= 0x3f;
43 uchar |= c << 6;
44
45 c = (u8) *s++;
46 if ((c & 0xc0) != 0x80)
47 goto fail;
48 c &= 0x3f;
49 uchar |= c;
50
51 /* no bogus surrogates */
52 if (0xd800 <= uchar && uchar <= 0xdfff)
53 goto fail;
54
55 /*
56 * 4-byte sequence (surrogate pairs, currently rare):
57 * 11101110wwwwzzzzyy + 110111yyyyxxxxxx
58 * = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
59 * (uuuuu = wwww + 1)
60 * FIXME accept the surrogate code points (only)
61 */
62 } else
63 goto fail;
64 } else
65 uchar = c;
66 put_unaligned_le16(uchar, cp++);
67 count++;
68 len--;
69 }
70 return count;
71fail:
72 return -1;
73}
74
75#endif /* _LINUX_UTF_H */