Line data Source code
1 : // Copyright 2005-2008 Google Inc. All Rights Reserved.
2 : // Author: jrm@google.com (Jim Meehan)
3 :
4 : #include <google/protobuf/stubs/common.h>
5 :
6 : #include <google/protobuf/stubs/stringpiece.h>
7 :
8 : namespace google {
9 : namespace protobuf {
10 : namespace internal {
11 :
12 : // These four-byte entries compactly encode how many bytes 0..255 to delete
13 : // in making a string replacement, how many bytes to add 0..255, and the offset
14 : // 0..64k-1 of the replacement string in remap_string.
15 : struct RemapEntry {
16 : uint8 delete_bytes;
17 : uint8 add_bytes;
18 : uint16 bytes_offset;
19 : };
20 :
21 : // Exit type codes for state tables. All but the first get stuffed into
22 : // signed one-byte entries. The first is only generated by executable code.
23 : // To distinguish from next-state entries, these must be contiguous and
24 : // all <= kExitNone
25 : typedef enum {
26 : kExitDstSpaceFull = 239,
27 : kExitIllegalStructure, // 240
28 : kExitOK, // 241
29 : kExitReject, // ...
30 : kExitReplace1,
31 : kExitReplace2,
32 : kExitReplace3,
33 : kExitReplace21,
34 : kExitReplace31,
35 : kExitReplace32,
36 : kExitReplaceOffset1,
37 : kExitReplaceOffset2,
38 : kExitReplace1S0,
39 : kExitSpecial,
40 : kExitDoAgain,
41 : kExitRejectAlt,
42 : kExitNone // 255
43 : } ExitReason;
44 :
45 :
46 : // This struct represents one entire state table. The three initialized byte
47 : // areas are state_table, remap_base, and remap_string. state0 and state0_size
48 : // give the byte offset and length within state_table of the initial state --
49 : // table lookups are expected to start and end in this state, but for
50 : // truncated UTF-8 strings, may end in a different state. These allow a quick
51 : // test for that condition. entry_shift is 8 for tables subscripted by a full
52 : // byte value and 6 for space-optimized tables subscripted by only six
53 : // significant bits in UTF-8 continuation bytes.
54 : typedef struct {
55 : const uint32 state0;
56 : const uint32 state0_size;
57 : const uint32 total_size;
58 : const int max_expand;
59 : const int entry_shift;
60 : const int bytes_per_entry;
61 : const uint32 losub;
62 : const uint32 hiadd;
63 : const uint8* state_table;
64 : const RemapEntry* remap_base;
65 : const uint8* remap_string;
66 : const uint8* fast_state;
67 : } UTF8StateMachineObj;
68 :
69 : typedef UTF8StateMachineObj UTF8ScanObj;
70 :
71 : #define X__ (kExitIllegalStructure)
72 : #define RJ_ (kExitReject)
73 : #define S1_ (kExitReplace1)
74 : #define S2_ (kExitReplace2)
75 : #define S3_ (kExitReplace3)
76 : #define S21 (kExitReplace21)
77 : #define S31 (kExitReplace31)
78 : #define S32 (kExitReplace32)
79 : #define T1_ (kExitReplaceOffset1)
80 : #define T2_ (kExitReplaceOffset2)
81 : #define S11 (kExitReplace1S0)
82 : #define SP_ (kExitSpecial)
83 : #define D__ (kExitDoAgain)
84 : #define RJA (kExitRejectAlt)
85 :
86 : // Entire table has 9 state blocks of 256 entries each
87 : static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0]
88 : static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1]
89 : static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
90 : static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
91 : static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
92 : static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
93 : static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
94 : static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
95 :
96 : static const uint8 utf8acceptnonsurrogates[] = {
97 : // state[0] 0x000000 Byte 1
98 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 :
103 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107 :
108 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
109 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
110 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
111 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
112 :
113 : X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
114 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
115 : 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
116 : 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
117 :
118 : // state[1] 0x000080 Byte 2 of 2
119 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
120 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
121 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
122 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
123 :
124 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
125 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
126 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
127 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
128 :
129 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
132 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133 :
134 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
135 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
136 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
137 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
138 :
139 : // state[2] 0x000000 Byte 2 of 3
140 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
141 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
142 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
143 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
144 :
145 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
146 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
147 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
148 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
149 :
150 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
151 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
152 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 :
155 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
156 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
157 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
158 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
159 :
160 : // state[3] 0x001000 Byte 2 of 3
161 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
162 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
163 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
164 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
165 :
166 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
167 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
168 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
169 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
170 :
171 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
174 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175 :
176 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
177 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
178 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
179 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
180 :
181 : // state[4] 0x000000 Byte 2 of 4
182 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
183 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
184 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
185 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
186 :
187 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
188 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
189 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
190 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
191 :
192 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
193 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
194 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
195 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
196 :
197 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
198 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
199 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
200 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
201 :
202 : // state[5] 0x040000 Byte 2 of 4
203 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
204 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
205 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
206 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
207 :
208 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
209 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
210 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
211 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
212 :
213 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
214 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
216 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
217 :
218 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
219 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
220 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
221 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
222 :
223 : // state[6] 0x100000 Byte 2 of 4
224 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
225 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
226 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
227 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
228 :
229 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
230 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
231 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
232 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
233 :
234 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
235 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
236 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
237 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
238 :
239 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
240 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
241 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
242 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
243 :
244 : // state[7] 0x00d000 Byte 2 of 3
245 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
246 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
247 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
248 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
249 :
250 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
251 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
252 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
253 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
254 :
255 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
256 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
257 : 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
258 : 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
259 :
260 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
261 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
262 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
263 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
264 :
265 : // state[8] 0x00d800 Byte 3 of 3
266 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
267 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
268 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
269 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
270 :
271 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
272 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
273 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
274 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
275 :
276 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
279 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
280 :
281 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
282 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
283 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
284 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
285 : };
286 :
287 : // Remap base[0] = (del, add, string_offset)
288 : static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
289 : {0, 0, 0} };
290 :
291 : // Remap string[0]
292 : static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
293 : 0 };
294 :
295 : static const unsigned char utf8acceptnonsurrogates_fast[256] = {
296 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
297 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
298 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
299 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
300 :
301 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
303 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
304 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
305 :
306 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
307 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
308 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
309 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
310 :
311 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
312 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
313 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
314 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
315 : };
316 :
317 : static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
318 : utf8acceptnonsurrogates_STATE0,
319 : utf8acceptnonsurrogates_STATE0_SIZE,
320 : utf8acceptnonsurrogates_TOTAL_SIZE,
321 : utf8acceptnonsurrogates_MAX_EXPAND_X4,
322 : utf8acceptnonsurrogates_SHIFT,
323 : utf8acceptnonsurrogates_BYTES,
324 : utf8acceptnonsurrogates_LOSUB,
325 : utf8acceptnonsurrogates_HIADD,
326 : utf8acceptnonsurrogates,
327 : utf8acceptnonsurrogates_remap_base,
328 : utf8acceptnonsurrogates_remap_string,
329 : utf8acceptnonsurrogates_fast
330 : };
331 :
332 :
333 : #undef X__
334 : #undef RJ_
335 : #undef S1_
336 : #undef S2_
337 : #undef S3_
338 : #undef S21
339 : #undef S31
340 : #undef S32
341 : #undef T1_
342 : #undef T2_
343 : #undef S11
344 : #undef SP_
345 : #undef D__
346 : #undef RJA
347 :
348 : // Return true if current Tbl pointer is within state0 range
349 : // Note that unsigned compare checks both ends of range simultaneously
350 : static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
351 0 : const uint8* Tbl0 = &st->state_table[st->state0];
352 0 : return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
353 : }
354 :
355 : // Scan a UTF-8 string based on state table.
356 : // Always scan complete UTF-8 characters
357 : // Set number of bytes scanned. Return reason for exiting
358 479138 : int UTF8GenericScan(const UTF8ScanObj* st,
359 : const char * str,
360 : int str_length,
361 : int* bytes_consumed) {
362 479138 : *bytes_consumed = 0;
363 479138 : if (str_length == 0) return kExitOK;
364 :
365 0 : int eshift = st->entry_shift;
366 0 : const uint8* isrc = reinterpret_cast<const uint8*>(str);
367 0 : const uint8* src = isrc;
368 0 : const uint8* srclimit = isrc + str_length;
369 0 : const uint8* srclimit8 = srclimit - 7;
370 0 : const uint8* Tbl_0 = &st->state_table[st->state0];
371 :
372 : DoAgain:
373 : // Do state-table scan
374 0 : int e = 0;
375 : uint8 c;
376 0 : const uint8* Tbl2 = &st->fast_state[0];
377 0 : const uint32 losub = st->losub;
378 0 : const uint32 hiadd = st->hiadd;
379 : // Check initial few bytes one at a time until 8-byte aligned
380 : //----------------------------
381 0 : while ((((uintptr_t)src & 0x07) != 0) &&
382 0 : (src < srclimit) &&
383 0 : Tbl2[src[0]] == 0) {
384 0 : src++;
385 : }
386 0 : if (((uintptr_t)src & 0x07) == 0) {
387 : // Do fast for groups of 8 identity bytes.
388 : // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
389 : // including slowing slightly on cr/lf/ht
390 : //----------------------------
391 0 : while (src < srclimit8) {
392 0 : uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
393 0 : uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
394 0 : src += 8;
395 : // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
396 0 : uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
397 0 : (s4567 - losub) | (s4567 + hiadd);
398 0 : if ((temp & 0x80808080) != 0) {
399 : // We typically end up here on cr/lf/ht; src was incremented
400 0 : int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
401 0 : (Tbl2[src[-6]] | Tbl2[src[-5]]);
402 0 : if (e0123 != 0) {
403 : src -= 8;
404 : break;
405 : } // Exit on Non-interchange
406 0 : e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
407 0 : (Tbl2[src[-2]] | Tbl2[src[-1]]);
408 0 : if (e0123 != 0) {
409 0 : src -= 4;
410 0 : break;
411 : } // Exit on Non-interchange
412 : // Else OK, go around again
413 : }
414 : }
415 : }
416 : //----------------------------
417 :
418 : // Byte-at-a-time scan
419 : //----------------------------
420 0 : const uint8* Tbl = Tbl_0;
421 0 : while (src < srclimit) {
422 0 : c = *src;
423 0 : e = Tbl[c];
424 0 : src++;
425 0 : if (e >= kExitIllegalStructure) {break;}
426 0 : Tbl = &Tbl_0[e << eshift];
427 : }
428 : //----------------------------
429 :
430 :
431 : // Exit posibilities:
432 : // Some exit code, !state0, back up over last char
433 : // Some exit code, state0, back up one byte exactly
434 : // source consumed, !state0, back up over partial char
435 : // source consumed, state0, exit OK
436 : // For illegal byte in state0, avoid backup up over PREVIOUS char
437 : // For truncated last char, back up to beginning of it
438 :
439 0 : if (e >= kExitIllegalStructure) {
440 : // Back up over exactly one byte of rejected/illegal UTF-8 character
441 0 : src--;
442 : // Back up more if needed
443 0 : if (!InStateZero(st, Tbl)) {
444 0 : do {
445 0 : src--;
446 0 : } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
447 : }
448 0 : } else if (!InStateZero(st, Tbl)) {
449 : // Back up over truncated UTF-8 character
450 : e = kExitIllegalStructure;
451 0 : do {
452 0 : src--;
453 0 : } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
454 : } else {
455 : // Normal termination, source fully consumed
456 : e = kExitOK;
457 : }
458 :
459 0 : if (e == kExitDoAgain) {
460 : // Loop back up to the fast scan
461 : goto DoAgain;
462 : }
463 :
464 0 : *bytes_consumed = src - isrc;
465 0 : return e;
466 : }
467 :
468 479236 : int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
469 : const char * str,
470 : int str_length,
471 : int* bytes_consumed) {
472 479236 : *bytes_consumed = 0;
473 479236 : if (str_length == 0) return kExitOK;
474 :
475 479168 : const uint8* isrc = reinterpret_cast<const uint8*>(str);
476 479168 : const uint8* src = isrc;
477 479168 : const uint8* srclimit = isrc + str_length;
478 479168 : const uint8* srclimit8 = srclimit - 7;
479 : int n;
480 : int rest_consumed;
481 : int exit_reason;
482 479141 : do {
483 : // Check initial few bytes one at a time until 8-byte aligned
484 479165 : while ((((uintptr_t)src & 0x07) != 0) &&
485 0 : (src < srclimit) && (src[0] < 0x80)) {
486 0 : src++;
487 : }
488 479165 : if (((uintptr_t)src & 0x07) == 0) {
489 31453774 : while ((src < srclimit8) &&
490 15487319 : (((reinterpret_cast<const uint32*>(src)[0] |
491 15487319 : reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
492 15487296 : src += 8;
493 : }
494 : }
495 2956197 : while ((src < srclimit) && (src[0] < 0x80)) {
496 2477032 : src++;
497 : }
498 : // Run state table on the rest
499 479165 : n = src - isrc;
500 479165 : exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
501 479141 : src += rest_consumed;
502 : } while ( exit_reason == kExitDoAgain );
503 :
504 479144 : *bytes_consumed = src - isrc;
505 479144 : return exit_reason;
506 : }
507 :
508 : // Hack: On some compilers the static tables are initialized at startup.
509 : // We can't use them until they are initialized. However, some Protocol
510 : // Buffer parsing happens at static init time and may try to validate
511 : // UTF-8 strings. Since UTF-8 validation is only used for debugging
512 : // anyway, we simply always return success if initialization hasn't
513 : // occurred yet.
514 : namespace {
515 :
516 : bool module_initialized_ = false;
517 :
518 : struct InitDetector {
519 : InitDetector() {
520 46 : module_initialized_ = true;
521 : }
522 : };
523 : InitDetector init_detector;
524 :
525 : } // namespace
526 :
527 479256 : bool IsStructurallyValidUTF8(const char* buf, int len) {
528 479256 : if (!module_initialized_) return true;
529 :
530 479259 : int bytes_consumed = 0;
531 : UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
532 479259 : buf, len, &bytes_consumed);
533 479141 : return (bytes_consumed == len);
534 : }
535 :
536 0 : int UTF8SpnStructurallyValid(const StringPiece& str) {
537 0 : if (!module_initialized_) return str.size();
538 :
539 0 : int bytes_consumed = 0;
540 : UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
541 0 : str.data(), str.size(), &bytes_consumed);
542 0 : return bytes_consumed;
543 : }
544 :
545 : // Coerce UTF-8 byte string in src_str to be
546 : // a structurally-valid equal-length string by selectively
547 : // overwriting illegal bytes with replace_char (typically blank).
548 : // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
549 : // src_str is read-only. If any overwriting is needed, a modified byte string
550 : // is created in idst, length isrclen.
551 : //
552 : // Returns pointer to output buffer, isrc if no changes were made,
553 : // or idst if some bytes were changed.
554 : //
555 : // Fast case: all is structurally valid and no byte copying is done.
556 : //
557 0 : char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
558 : char* idst,
559 : const char replace_char) {
560 0 : const char* isrc = src_str.data();
561 0 : const int len = src_str.length();
562 0 : int n = UTF8SpnStructurallyValid(src_str);
563 0 : if (n == len) { // Normal case -- all is cool, return
564 : return const_cast<char*>(isrc);
565 : } else { // Unusual case -- copy w/o bad bytes
566 0 : const char* src = isrc;
567 0 : const char* srclimit = isrc + len;
568 0 : char* dst = idst;
569 0 : memmove(dst, src, n); // Copy initial good chunk
570 0 : src += n;
571 0 : dst += n;
572 0 : while (src < srclimit) { // src points to bogus byte or is off the end
573 0 : dst[0] = replace_char; // replace one bad byte
574 0 : src++;
575 0 : dst++;
576 0 : StringPiece str2(src, srclimit - src);
577 0 : n = UTF8SpnStructurallyValid(str2); // scan the remainder
578 0 : memmove(dst, src, n); // copy next good chunk
579 0 : src += n;
580 0 : dst += n;
581 : }
582 : }
583 : return idst;
584 : }
585 :
586 : } // namespace internal
587 : } // namespace protobuf
588 92 : } // namespace google
|