Line data Source code
1 : // Protocol Buffers - Google's data interchange format
2 : // Copyright 2008 Google Inc. All rights reserved.
3 : // https://developers.google.com/protocol-buffers/
4 : //
5 : // Redistribution and use in source and binary forms, with or without
6 : // modification, are permitted provided that the following conditions are
7 : // met:
8 : //
9 : // * Redistributions of source code must retain the above copyright
10 : // notice, this list of conditions and the following disclaimer.
11 : // * Redistributions in binary form must reproduce the above
12 : // copyright notice, this list of conditions and the following disclaimer
13 : // in the documentation and/or other materials provided with the
14 : // distribution.
15 : // * Neither the name of Google Inc. nor the names of its
16 : // contributors may be used to endorse or promote products derived from
17 : // this software without specific prior written permission.
18 : //
19 : // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 : // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 : // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 : // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 : // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 : // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 : // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 : // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 : // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 : // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 : // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 :
31 : // from google3/strings/strutil.cc
32 :
33 : #include <google/protobuf/stubs/strutil.h>
34 : #include <google/protobuf/stubs/mathlimits.h>
35 :
36 : #include <errno.h>
37 : #include <float.h> // FLT_DIG and DBL_DIG
38 : #include <limits>
39 : #include <limits.h>
40 : #include <stdio.h>
41 : #include <iterator>
42 :
43 : #include <google/protobuf/stubs/stl_util.h>
44 :
45 : #ifdef _WIN32
46 : // MSVC has only _snprintf, not snprintf.
47 : //
48 : // MinGW has both snprintf and _snprintf, but they appear to be different
49 : // functions. The former is buggy. When invoked like so:
50 : // char buffer[32];
51 : // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
52 : // it prints "1.23000e+10". This is plainly wrong: %g should never print
53 : // trailing zeros after the decimal point. For some reason this bug only
54 : // occurs with some input values, not all. In any case, _snprintf does the
55 : // right thing, so we use it.
56 : #define snprintf _snprintf
57 : #endif
58 :
59 : namespace google {
60 : namespace protobuf {
61 :
62 : // These are defined as macros on some platforms. #undef them so that we can
63 : // redefine them.
64 : #undef isxdigit
65 : #undef isprint
66 :
67 : // The definitions of these in ctype.h change based on locale. Since our
68 : // string manipulation is all in relation to the protocol buffer and C++
69 : // languages, we always want to use the C locale. So, we re-define these
70 : // exactly as we want them.
// Locale-independent hex-digit test: true only for the ASCII characters
// 0-9, a-f and A-F.
inline bool isxdigit(char c) {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'a' && c <= 'f') return true;
  return c >= 'A' && c <= 'F';
}
76 :
// Locale-independent printable test: true only for bytes in the range
// 0x20 (space) through 0x7E ('~').
inline bool isprint(char c) {
  const bool below_space = c < 0x20;
  const bool above_tilde = c > 0x7E;
  return !(below_space || above_tilde);
}
80 :
81 : // ----------------------------------------------------------------------
82 : // StripString
83 : // Replaces any occurrence of the character 'remove' (or the characters
84 : // in 'remove') with the character 'replacewith'.
85 : // ----------------------------------------------------------------------
// Overwrites, in place, every character of *s that appears in the
// NUL-terminated set 'remove' with 'replacewith'.
//
// The search uses string::find_first_of over the full length of *s, so
// matches located after an embedded '\0' are replaced too, and no raw
// pointer into the string's storage is held while it is being modified.
void StripString(string* s, const char* remove, char replacewith) {
  for (string::size_type pos = s->find_first_of(remove);
       pos != string::npos;
       pos = s->find_first_of(remove, pos + 1)) {
    (*s)[pos] = replacewith;
  }
}
95 :
96 0 : void StripWhitespace(string* str) {
97 0 : int str_length = str->length();
98 :
99 : // Strip off leading whitespace.
100 0 : int first = 0;
101 0 : while (first < str_length && ascii_isspace(str->at(first))) {
102 0 : ++first;
103 : }
104 : // If entire string is white space.
105 0 : if (first == str_length) {
106 : str->clear();
107 0 : return;
108 : }
109 0 : if (first > 0) {
110 0 : str->erase(0, first);
111 0 : str_length -= first;
112 : }
113 :
114 : // Strip off trailing whitespace.
115 0 : int last = str_length - 1;
116 0 : while (last >= 0 && ascii_isspace(str->at(last))) {
117 0 : --last;
118 : }
119 0 : if (last != (str_length - 1) && last >= 0) {
120 0 : str->erase(last + 1, string::npos);
121 : }
122 : }
123 :
124 : // ----------------------------------------------------------------------
125 : // StringReplace()
126 : // Replace the "old" pattern with the "new" pattern in a string,
127 : // and append the result to "res". If replace_all is false,
128 : // it only replaces the first instance of "old."
129 : // ----------------------------------------------------------------------
130 :
// Appends to *res a copy of 's' in which 'oldsub' has been replaced by
// 'newsub'. Only the first occurrence is replaced unless 'replace_all' is
// true. When 'oldsub' is empty, 's' is appended unchanged.
void StringReplace(const string& s, const string& oldsub,
                   const string& newsub, bool replace_all,
                   string* res) {
  if (oldsub.empty()) {
    res->append(s);  // Nothing to search for.
    return;
  }

  string::size_type cursor = 0;  // First byte of 's' not yet copied.
  for (;;) {
    const string::size_type hit = s.find(oldsub, cursor);
    if (hit == string::npos) {
      break;
    }
    // Copy the text preceding the match, then the replacement.
    res->append(s, cursor, hit - cursor);
    res->append(newsub);
    cursor = hit + oldsub.size();  // Resume searching after the match.
    if (!replace_all) {
      break;
    }
  }
  // Copy whatever follows the last replacement.
  res->append(s, cursor, s.length() - cursor);
}
152 :
153 : // ----------------------------------------------------------------------
154 : // StringReplace()
155 : // Give me a string and two patterns "old" and "new", and I replace
156 : // the first instance of "old" in the string with "new", if it
157 : // exists. If "global" is true; call this repeatedly until it
158 : // fails. RETURN a new string, regardless of whether the replacement
159 : // happened or not.
160 : // ----------------------------------------------------------------------
161 :
162 26891 : string StringReplace(const string& s, const string& oldsub,
163 : const string& newsub, bool replace_all) {
164 : string ret;
165 26891 : StringReplace(s, oldsub, newsub, replace_all, &ret);
166 26891 : return ret;
167 : }
168 :
169 : // ----------------------------------------------------------------------
170 : // SplitStringUsing()
171 : // Split a string using a character delimiter. Append the components
172 : // to 'result'.
173 : //
174 : // Note: For multi-character delimiters, this routine will split on *ANY* of
175 : // the characters in the string, not the entire string as a single delimiter.
176 : // ----------------------------------------------------------------------
// Splits 'full' on any character contained in 'delim' and writes each
// non-empty piece through the output iterator 'result'. Runs of delimiters
// and leading/trailing delimiters produce no empty pieces.
template <typename ITR>
static inline
void SplitStringToIteratorUsing(const string& full,
                                const char* delim,
                                ITR& result) {
  // Fast path for the common case of exactly one delimiter character.
  const bool single_delim = delim[0] != '\0' && delim[1] == '\0';
  if (single_delim) {
    const char sep = delim[0];
    const char* cursor = full.data();
    const char* const limit = cursor + full.size();
    while (cursor != limit) {
      if (*cursor == sep) {
        ++cursor;  // Skip the delimiter.
        continue;
      }
      // Consume one token: everything up to the next delimiter or the end.
      const char* const token = cursor;
      do {
        ++cursor;
      } while (cursor != limit && *cursor != sep);
      *result++ = string(token, cursor - token);
    }
    return;
  }

  // General path: 'delim' is treated as a set of delimiter characters.
  string::size_type token_begin = full.find_first_not_of(delim);
  while (token_begin != string::npos) {
    const string::size_type token_end =
        full.find_first_of(delim, token_begin);
    if (token_end == string::npos) {
      // Last token runs to the end of the input.
      *result++ = full.substr(token_begin);
      break;
    }
    *result++ = full.substr(token_begin, token_end - token_begin);
    token_begin = full.find_first_not_of(delim, token_end);
  }
}
211 :
212 366 : void SplitStringUsing(const string& full,
213 : const char* delim,
214 : vector<string>* result) {
215 : back_insert_iterator< vector<string> > it(*result);
216 366 : SplitStringToIteratorUsing(full, delim, it);
217 366 : }
218 :
219 : // Split a string using a character delimiter. Append the components
220 : // to 'result'. If there are consecutive delimiters, this function
221 : // will return corresponding empty strings. The string is split into
222 : // at most the specified number of pieces greedily. This means that the
223 : // last piece may possibly be split further. To split into as many pieces
224 : // as possible, specify 0 as the number of pieces.
225 : //
226 : // If "full" is the empty string, yields an empty string as the only value.
227 : //
228 : // If "pieces" is negative for some reason, it returns the whole string
229 : // ----------------------------------------------------------------------
// Splits 'full' on any character in 'delim', writing every piece (including
// empty ones) through 'result'. At most 'pieces' pieces are produced, the
// last one holding the unsplit remainder; pieces == 0 means "no limit".
// A negative 'pieces' yields the whole input as a single piece.
template <typename StringType, typename ITR>
static inline
void SplitStringToIteratorAllowEmpty(const StringType& full,
                                     const char* delim,
                                     int pieces,
                                     ITR& result) {
  string::size_type piece_start = 0;

  // Each iteration emits one piece that is terminated by a delimiter.
  for (int emitted = 0; (emitted < pieces - 1) || (pieces == 0); emitted++) {
    const string::size_type sep = full.find_first_of(delim, piece_start);
    if (sep == string::npos) {
      break;  // No more delimiters; the remainder is emitted below.
    }
    *result++ = full.substr(piece_start, sep - piece_start);
    piece_start = sep + 1;
  }

  // Final piece: everything that has not been consumed yet.
  *result++ = full.substr(piece_start);
}
250 :
251 0 : void SplitStringAllowEmpty(const string& full, const char* delim,
252 : vector<string>* result) {
253 : back_insert_iterator<vector<string> > it(*result);
254 0 : SplitStringToIteratorAllowEmpty(full, delim, 0, it);
255 0 : }
256 :
257 : // ----------------------------------------------------------------------
258 : // JoinStrings()
259 : // This merges a vector of string components with delim inserted
260 : // as separators between components.
261 : //
262 : // ----------------------------------------------------------------------
263 : template <class ITERATOR>
264 0 : static void JoinStringsIterator(const ITERATOR& start,
265 : const ITERATOR& end,
266 : const char* delim,
267 : string* result) {
268 0 : GOOGLE_CHECK(result != NULL);
269 : result->clear();
270 0 : int delim_length = strlen(delim);
271 :
272 : // Precompute resulting length so we can reserve() memory in one shot.
273 0 : int length = 0;
274 0 : for (ITERATOR iter = start; iter != end; ++iter) {
275 0 : if (iter != start) {
276 0 : length += delim_length;
277 : }
278 0 : length += iter->size();
279 : }
280 0 : result->reserve(length);
281 :
282 : // Now combine everything.
283 0 : for (ITERATOR iter = start; iter != end; ++iter) {
284 0 : if (iter != start) {
285 0 : result->append(delim, delim_length);
286 : }
287 0 : result->append(iter->data(), iter->size());
288 : }
289 0 : }
290 :
291 0 : void JoinStrings(const vector<string>& components,
292 : const char* delim,
293 : string * result) {
294 0 : JoinStringsIterator(components.begin(), components.end(), delim, result);
295 0 : }
296 :
297 : // ----------------------------------------------------------------------
298 : // UnescapeCEscapeSequences()
299 : // This does all the unescaping that C does: \ooo, \r, \n, etc
300 : // Returns length of resulting string.
301 : // The implementation of \x parses any positive number of hex digits,
302 : // but it is an error if the value requires more than 8 bits, and the
303 : // result is truncated to 8 bits.
304 : //
305 : // The second call stores its errors in a supplied string vector.
306 : // If the string vector pointer is NULL, it reports the errors with LOG().
307 : // ----------------------------------------------------------------------
308 :
309 : #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
310 :
311 : // Protocol buffers doesn't ever care about errors, but I don't want to remove
312 : // the code.
313 : #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
314 :
315 0 : int UnescapeCEscapeSequences(const char* source, char* dest) {
316 0 : return UnescapeCEscapeSequences(source, dest, NULL);
317 : }
318 :
// Copies the NUL-terminated string 'source' into 'dest', replacing C-style
// backslash escape sequences with the bytes they denote, and NUL-terminates
// 'dest'. Returns the number of bytes written, not counting that NUL.
//
// Every escape consumes at least two source bytes and produces at most one
// output byte, so the output is never longer than the input. The first loop
// below handles the case where 'source' and 'dest' are the same buffer.
//
// 'errors' is expected to be NULL (debug-checked). LOG_STRING is defined in
// this file as GOOGLE_LOG_IF(LEVEL, false), so the diagnostics below are
// never emitted.
319 8 : int UnescapeCEscapeSequences(const char* source, char* dest,
320 : vector<string> *errors) {
321 : GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented.";
322 :
323 8 : char* d = dest;
324 8 : const char* p = source;
325 :
326 : // Small optimization for case where source = dest and there's no escaping
327 16 : while ( p == d && *p != '\0' && *p != '\\' )
328 0 : p++, d++;
329 :
// Main loop: 'p' reads from the source, 'd' writes to the destination.
330 57 : while (*p != '\0') {
331 49 : if (*p != '\\') {
332 35 : *d++ = *p++;
333 : } else {
334 14 : switch ( *++p ) { // skip past the '\\'
335 : case '\0':
// A lone trailing backslash: terminate the output and stop here.
336 : LOG_STRING(ERROR, errors) << "String cannot end with \\";
337 0 : *d = '\0';
338 0 : return d - dest; // we're done with p
339 0 : case 'a': *d++ = '\a'; break;
340 0 : case 'b': *d++ = '\b'; break;
341 0 : case 'f': *d++ = '\f'; break;
342 1 : case 'n': *d++ = '\n'; break;
343 1 : case 'r': *d++ = '\r'; break;
344 1 : case 't': *d++ = '\t'; break;
345 0 : case 'v': *d++ = '\v'; break;
346 1 : case '\\': *d++ = '\\'; break;
347 0 : case '?': *d++ = '\?'; break; // \? Who knew?
348 1 : case '\'': *d++ = '\''; break;
349 1 : case '"': *d++ = '\"'; break;
350 : case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
351 : case '4': case '5': case '6': case '7': {
// The value is accumulated in a char, so only its low bits are kept.
352 8 : char ch = *p - '0';
353 8 : if ( IS_OCTAL_DIGIT(p[1]) )
354 8 : ch = ch * 8 + *++p - '0';
355 8 : if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
356 8 : ch = ch * 8 + *++p - '0'; // now points at last digit
357 8 : *d++ = ch;
358 8 : break;
359 : }
360 : case 'x': case 'X': {
// "\x" with no hex digit after it writes nothing; only the 'x' is skipped
// (by the p++ after the switch).
361 0 : if (!isxdigit(p[1])) {
362 : if (p[1] == '\0') {
363 : LOG_STRING(ERROR, errors) << "String cannot end with \\x";
364 : } else {
365 : LOG_STRING(ERROR, errors) <<
366 : "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
367 : }
368 : break;
369 : }
370 : unsigned int ch = 0;
371 : const char *hex_start = p;
372 0 : while (isxdigit(p[1])) // arbitrarily many hex digits
373 0 : ch = (ch << 4) + hex_digit_to_int(*++p);
374 : if (ch > 0xFF)
375 : LOG_STRING(ERROR, errors) << "Value of " <<
376 : "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
// Storing the unsigned value into a char keeps only its low 8 bits.
377 0 : *d++ = ch;
378 0 : break;
379 : }
// The \u and \U cases below are compiled out by the "#if 0".
380 : #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
381 : case 'u': {
382 : // \uhhhh => convert 4 hex digits to UTF-8
383 : char32 rune = 0;
384 : const char *hex_start = p;
385 : for (int i = 0; i < 4; ++i) {
386 : if (isxdigit(p[1])) { // Look one char ahead.
387 : rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
388 : } else {
389 : LOG_STRING(ERROR, errors)
390 : << "\\u must be followed by 4 hex digits: \\"
391 : << string(hex_start, p+1-hex_start);
392 : break;
393 : }
394 : }
395 : d += runetochar(d, &rune);
396 : break;
397 : }
398 : case 'U': {
399 : // \Uhhhhhhhh => convert 8 hex digits to UTF-8
400 : char32 rune = 0;
401 : const char *hex_start = p;
402 : for (int i = 0; i < 8; ++i) {
403 : if (isxdigit(p[1])) { // Look one char ahead.
404 : // Don't change rune until we're sure this
405 : // is within the Unicode limit, but do advance p.
406 : char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
407 : if (newrune > 0x10FFFF) {
408 : LOG_STRING(ERROR, errors)
409 : << "Value of \\"
410 : << string(hex_start, p + 1 - hex_start)
411 : << " exceeds Unicode limit (0x10FFFF)";
412 : break;
413 : } else {
414 : rune = newrune;
415 : }
416 : } else {
417 : LOG_STRING(ERROR, errors)
418 : << "\\U must be followed by 8 hex digits: \\"
419 : << string(hex_start, p+1-hex_start);
420 : break;
421 : }
422 : }
423 : d += runetochar(d, &rune);
424 : break;
425 : }
426 : #endif
427 : default:
// Unrecognized escape: nothing is written; the letter is skipped below.
428 : LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
429 : }
430 14 : p++; // read past letter we escaped
431 : }
432 : }
433 8 : *d = '\0';
434 8 : return d - dest;
435 : }
436 :
437 : // ----------------------------------------------------------------------
438 : // UnescapeCEscapeString()
439 : // This does the same thing as UnescapeCEscapeSequences, but creates
440 : // a new string. The caller does not need to worry about allocating
441 : // a dest buffer. This should be used for non performance critical
442 : // tasks such as printing debug messages. It is safe for src and dest
443 : // to be the same.
444 : //
445 : // The second call stores its errors in a supplied string vector.
446 : // If the string vector pointer is NULL, it reports the errors with LOG().
447 : //
448 : // In the first and second calls, the length of dest is returned. In the
449 : // third call, the new string is returned.
450 : // ----------------------------------------------------------------------
451 0 : int UnescapeCEscapeString(const string& src, string* dest) {
452 0 : return UnescapeCEscapeString(src, dest, NULL);
453 : }
454 :
455 0 : int UnescapeCEscapeString(const string& src, string* dest,
456 : vector<string> *errors) {
457 0 : scoped_array<char> unescaped(new char[src.size() + 1]);
458 0 : int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
459 0 : GOOGLE_CHECK(dest);
460 0 : dest->assign(unescaped.get(), len);
461 0 : return len;
462 : }
463 :
464 8 : string UnescapeCEscapeString(const string& src) {
465 8 : scoped_array<char> unescaped(new char[src.size() + 1]);
466 16 : int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL);
467 16 : return string(unescaped.get(), len);
468 : }
469 :
470 : // ----------------------------------------------------------------------
471 : // CEscapeString()
472 : // CHexEscapeString()
473 : // Copies 'src' to 'dest', escaping dangerous characters using
474 : // C-style escape sequences. This is very useful for preparing query
475 : // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
476 : // hexadecimal rather than octal sequences.
477 : // Returns the number of bytes written to 'dest' (not including the \0)
478 : // or -1 if there was insufficient space.
479 : //
480 : // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
481 : // ----------------------------------------------------------------------
// Writes a C-escaped, NUL-terminated copy of src[0, src_len) into
// dest[0, dest_len).
//
//   use_hex   - emit numeric escapes as \xNN instead of octal \NNN.
//   utf8_safe - copy bytes >= 0x80 through unchanged instead of escaping them.
//
// Returns the number of bytes written, not counting the terminating NUL, or
// -1 if 'dest' is too small.
//
// Numeric escapes are written digit by digit. Unlike sprintf(), this never
// stores a NUL after the four escape characters, so nothing is written past
// dest[dest_len - 1] when exactly four bytes remain.
int CEscapeInternal(const char* src, int src_len, char* dest,
                    int dest_len, bool use_hex, bool utf8_safe) {
  static const char kHexDigits[] = "0123456789abcdef";
  const char* src_end = src + src_len;
  int used = 0;
  bool last_hex_escape = false;  // true if last output char was \xNN

  for (; src < src_end; src++) {
    if (dest_len - used < 2)  // Need space for two letter escape
      return -1;

    bool is_hex_escape = false;
    switch (*src) {
      case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
      case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
      case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
      case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
      case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
      case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
      default: {
        const unsigned char byte = static_cast<unsigned char>(*src);
        // Note that if we emit \xNN and the src character after that is a
        // hex digit then that digit must be escaped too to prevent it being
        // interpreted as part of the character code by C.
        const bool needs_escape =
            (!utf8_safe || byte < 0x80) &&
            (!isprint(*src) || (last_hex_escape && isxdigit(*src)));
        if (!needs_escape) {
          dest[used++] = *src;
          break;
        }
        if (dest_len - used < 4)  // need space for 4 letter escape
          return -1;
        dest[used++] = '\\';
        if (use_hex) {
          // Same output as "\\x%02x".
          dest[used++] = 'x';
          dest[used++] = kHexDigits[byte >> 4];
          dest[used++] = kHexDigits[byte & 0xF];
        } else {
          // Same output as "\\%03o".
          dest[used++] = static_cast<char>('0' + (byte >> 6));
          dest[used++] = static_cast<char>('0' + ((byte >> 3) & 7));
          dest[used++] = static_cast<char>('0' + (byte & 7));
        }
        is_hex_escape = use_hex;
        break;
      }
    }
    last_hex_escape = is_hex_escape;
  }

  if (dest_len - used < 1)  // make sure that there is room for \0
    return -1;

  dest[used] = '\0';  // doesn't count towards return value though
  return used;
}
526 :
527 0 : int CEscapeString(const char* src, int src_len, char* dest, int dest_len) {
528 0 : return CEscapeInternal(src, src_len, dest, dest_len, false, false);
529 : }
530 :
531 : // ----------------------------------------------------------------------
532 : // CEscape()
533 : // CHexEscape()
534 : // Copies 'src' to result, escaping dangerous characters using
535 : // C-style escape sequences. This is very useful for preparing query
536 : // flags. 'src' and 'dest' should not overlap. The 'Hex' version
537 : // uses hexadecimal rather than octal sequences.
538 : //
539 : // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
540 : // ----------------------------------------------------------------------
541 12080 : string CEscape(const string& src) {
542 12080 : const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
543 12080 : scoped_array<char> dest(new char[dest_length]);
544 : const int len = CEscapeInternal(src.data(), src.size(),
545 24160 : dest.get(), dest_length, false, false);
546 : GOOGLE_DCHECK_GE(len, 0);
547 24160 : return string(dest.get(), len);
548 : }
549 :
550 : namespace strings {
551 :
552 0 : string Utf8SafeCEscape(const string& src) {
553 0 : const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
554 0 : scoped_array<char> dest(new char[dest_length]);
555 : const int len = CEscapeInternal(src.data(), src.size(),
556 0 : dest.get(), dest_length, false, true);
557 : GOOGLE_DCHECK_GE(len, 0);
558 0 : return string(dest.get(), len);
559 : }
560 :
561 0 : string CHexEscape(const string& src) {
562 0 : const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
563 0 : scoped_array<char> dest(new char[dest_length]);
564 : const int len = CEscapeInternal(src.data(), src.size(),
565 0 : dest.get(), dest_length, true, false);
566 : GOOGLE_DCHECK_GE(len, 0);
567 0 : return string(dest.get(), len);
568 : }
569 :
570 : } // namespace strings
571 :
572 : // ----------------------------------------------------------------------
573 : // strto32_adaptor()
574 : // strtou32_adaptor()
575 : // Implementation of strto[u]l replacements that have identical
576 : // overflow and underflow characteristics for both ILP-32 and LP-64
577 : // platforms, including errno preservation in error-free calls.
578 : // ----------------------------------------------------------------------
579 :
580 0 : int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
581 0 : const int saved_errno = errno;
582 0 : errno = 0;
583 0 : const long result = strtol(nptr, endptr, base);
584 0 : if (errno == ERANGE && result == LONG_MIN) {
585 : return kint32min;
586 0 : } else if (errno == ERANGE && result == LONG_MAX) {
587 : return kint32max;
588 0 : } else if (errno == 0 && result < kint32min) {
589 0 : errno = ERANGE;
590 0 : return kint32min;
591 0 : } else if (errno == 0 && result > kint32max) {
592 0 : errno = ERANGE;
593 0 : return kint32max;
594 : }
595 0 : if (errno == 0)
596 0 : errno = saved_errno;
597 0 : return static_cast<int32>(result);
598 : }
599 :
600 0 : uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
601 0 : const int saved_errno = errno;
602 0 : errno = 0;
603 0 : const unsigned long result = strtoul(nptr, endptr, base);
604 0 : if (errno == ERANGE && result == ULONG_MAX) {
605 : return kuint32max;
606 0 : } else if (errno == 0 && result > kuint32max) {
607 0 : errno = ERANGE;
608 0 : return kuint32max;
609 : }
610 0 : if (errno == 0)
611 0 : errno = saved_errno;
612 0 : return static_cast<uint32>(result);
613 : }
614 :
// Strips leading/trailing ' ' characters and one optional leading '+' or
// '-' from *text.
//
// On success returns true, stores whether the sign was '-' in *negative_ptr
// and replaces *text with what remains. Returns false, leaving *text
// untouched, when nothing remains after trimming or after the sign; in the
// latter case *negative_ptr has already been written.
inline bool safe_parse_sign(string* text  /*inout*/,
                            bool* negative_ptr  /*output*/) {
  string::size_type lo = 0;
  string::size_type hi = text->size();

  // Consume whitespace (only the space character counts).
  while (lo < hi && (*text)[lo] == ' ') {
    ++lo;
  }
  while (lo < hi && (*text)[hi - 1] == ' ') {
    --hi;
  }
  if (lo >= hi) {
    return false;
  }

  // Consume sign.
  const char lead = (*text)[lo];
  *negative_ptr = (lead == '-');
  if (lead == '-' || lead == '+') {
    ++lo;
    if (lo >= hi) {
      return false;
    }
  }

  *text = text->substr(lo, hi - lo);
  return true;
}
642 :
// Parses 'text' as an unsigned run of decimal digits into *value_p.
//
// Returns true when every character is a digit and the value fits in
// IntType (an empty 'text' yields 0). On a non-digit, returns false with
// *value_p set to the value parsed so far; on overflow, returns false with
// *value_p set to the maximum IntType value.
template<typename IntType>
bool safe_parse_positive_int(
    string text, IntType* value_p) {
  int base = 10;
  const IntType vmax = std::numeric_limits<IntType>::max();
  assert(vmax > 0);
  assert(vmax >= base);
  // Largest accumulator value that can still be multiplied by 'base'.
  const IntType vmax_over_base = vmax / base;

  IntType acc = 0;
  for (string::size_type i = 0; i < text.size(); ++i) {
    const unsigned char c = static_cast<unsigned char>(text[i]);
    const int digit = c - '0';
    if (digit < 0 || digit >= base) {
      *value_p = acc;
      return false;
    }
    if (acc > vmax_over_base) {
      *value_p = vmax;  // Multiplying would overflow.
      return false;
    }
    acc *= base;
    if (acc > vmax - digit) {
      *value_p = vmax;  // Adding the digit would overflow.
      return false;
    }
    acc += digit;
  }
  *value_p = acc;
  return true;
}
676 :
// Parses 'text' as a run of decimal digits and stores the NEGATED value in
// *value_p (the caller has already consumed the '-' sign).
//
// Returns true when every character is a digit and the negated value fits
// in IntType (an empty 'text' yields 0). On a non-digit, returns false with
// *value_p set to the value accumulated so far; on underflow, returns false
// with *value_p set to the minimum IntType value.
template<typename IntType>
bool safe_parse_negative_int(
    const string& text, IntType* value_p) {
  int base = 10;
  const IntType vmin = std::numeric_limits<IntType>::min();
  assert(vmin < 0);
  assert(vmin <= 0 - base);
  // Smallest accumulator value that can still be multiplied by 'base'.
  // 2003 c++ standard [expr.mul]
  // "... the sign of the remainder is implementation-defined."
  // Although (vmin/base)*base + vmin%base is always vmin.
  // 2011 c++ standard tightens the spec but we cannot rely on it.
  IntType vmin_over_base = vmin / base;
  if (vmin % base > 0) {
    vmin_over_base += 1;
  }

  IntType acc = 0;
  for (string::size_type i = 0; i < text.size(); ++i) {
    const unsigned char c = static_cast<unsigned char>(text[i]);
    const int digit = c - '0';
    if (digit < 0 || digit >= base) {
      *value_p = acc;
      return false;
    }
    if (acc < vmin_over_base) {
      *value_p = vmin;  // Multiplying would underflow.
      return false;
    }
    acc *= base;
    if (acc < vmin + digit) {
      *value_p = vmin;  // Subtracting the digit would underflow.
      return false;
    }
    acc -= digit;
  }
  *value_p = acc;
  return true;
}
717 :
718 : template<typename IntType>
719 0 : bool safe_int_internal(string text, IntType* value_p) {
720 0 : *value_p = 0;
721 : bool negative;
722 0 : if (!safe_parse_sign(&text, &negative)) {
723 : return false;
724 : }
725 0 : if (!negative) {
726 0 : return safe_parse_positive_int(text, value_p);
727 : } else {
728 0 : return safe_parse_negative_int(text, value_p);
729 : }
730 : }
731 :
732 : template<typename IntType>
733 0 : bool safe_uint_internal(string text, IntType* value_p) {
734 0 : *value_p = 0;
735 : bool negative;
736 0 : if (!safe_parse_sign(&text, &negative) || negative) {
737 : return false;
738 : }
739 0 : return safe_parse_positive_int(text, value_p);
740 : }
741 :
742 : // ----------------------------------------------------------------------
743 : // FastIntToBuffer()
744 : // FastInt64ToBuffer()
745 : // FastHexToBuffer()
746 : // FastHex64ToBuffer()
747 : // FastHex32ToBuffer()
748 : // ----------------------------------------------------------------------
749 :
750 : // Offset into buffer where FastInt64ToBuffer places the end of string
751 : // null character. Also used by FastInt64ToBufferLeft.
752 : static const int kFastInt64ToBufferOffset = 21;
753 :
754 217 : char *FastInt64ToBuffer(int64 i, char* buffer) {
755 : // We could collapse the positive and negative sections, but that
756 : // would be slightly slower for positive numbers...
757 : // 22 bytes is enough to store -2**64, -18446744073709551616.
758 217 : char* p = buffer + kFastInt64ToBufferOffset;
759 217 : *p-- = '\0';
760 217 : if (i >= 0) {
761 228 : do {
762 228 : *p-- = '0' + i % 10;
763 228 : i /= 10;
764 : } while (i > 0);
765 : return p + 1;
766 : } else {
767 : // On different platforms, % and / have different behaviors for
768 : // negative numbers, so we need to jump through hoops to make sure
769 : // we don't divide negative numbers.
770 45 : if (i > -10) {
771 0 : i = -i;
772 0 : *p-- = '0' + i;
773 0 : *p = '-';
774 0 : return p;
775 : } else {
776 : // Make sure we aren't at MIN_INT, in which case we can't say i = -i
777 45 : i = i + 10;
778 45 : i = -i;
779 45 : *p-- = '0' + i % 10;
780 : // Undo what we did a moment ago
781 45 : i = i / 10 + 1;
782 334 : do {
783 334 : *p-- = '0' + i % 10;
784 334 : i /= 10;
785 : } while (i > 0);
786 45 : *p = '-';
787 45 : return p;
788 : }
789 : }
790 : }
791 :
792 : // Offset into buffer where FastInt32ToBuffer places the end of string
793 : // null character. Also used by FastInt32ToBufferLeft
794 : static const int kFastInt32ToBufferOffset = 11;
795 :
796 : // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
797 : // compiler to generate 32 bit arithmetic instructions. It's much faster, at
798 : // least with 32 bit binaries.
799 48709 : char *FastInt32ToBuffer(int32 i, char* buffer) {
800 : // We could collapse the positive and negative sections, but that
801 : // would be slightly slower for positive numbers...
802 : // 12 bytes is enough to store -2**32, -4294967296.
803 48709 : char* p = buffer + kFastInt32ToBufferOffset;
804 48709 : *p-- = '\0';
805 48709 : if (i >= 0) {
806 85205 : do {
807 85205 : *p-- = '0' + i % 10;
808 85205 : i /= 10;
809 : } while (i > 0);
810 : return p + 1;
811 : } else {
812 : // On different platforms, % and / have different behaviors for
813 : // negative numbers, so we need to jump through hoops to make sure
814 : // we don't divide negative numbers.
815 57 : if (i > -10) {
816 6 : i = -i;
817 6 : *p-- = '0' + i;
818 6 : *p = '-';
819 6 : return p;
820 : } else {
821 : // Make sure we aren't at MIN_INT, in which case we can't say i = -i
822 51 : i = i + 10;
823 51 : i = -i;
824 51 : *p-- = '0' + i % 10;
825 : // Undo what we did a moment ago
826 51 : i = i / 10 + 1;
827 193 : do {
828 193 : *p-- = '0' + i % 10;
829 193 : i /= 10;
830 : } while (i > 0);
831 51 : *p = '-';
832 51 : return p;
833 : }
834 : }
835 : }
836 :
837 0 : char *FastHexToBuffer(int i, char* buffer) {
838 0 : GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
839 :
840 : static const char *hexdigits = "0123456789abcdef";
841 0 : char *p = buffer + 21;
842 0 : *p-- = '\0';
843 0 : do {
844 0 : *p-- = hexdigits[i & 15]; // mod by 16
845 0 : i >>= 4; // divide by 16
846 : } while (i > 0);
847 0 : return p + 1;
848 : }
849 :
850 0 : char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
851 : static const char *hexdigits = "0123456789abcdef";
852 0 : buffer[num_byte] = '\0';
853 0 : for (int i = num_byte - 1; i >= 0; i--) {
854 : #ifdef _M_X64
855 : // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
856 : // block. Given that the uint32 cast was to improve performance on 32-bit
857 : // platforms, we use 64-bit '&' directly.
858 : buffer[i] = hexdigits[value & 0xf];
859 : #else
860 0 : buffer[i] = hexdigits[uint32(value) & 0xf];
861 : #endif
862 0 : value >>= 4;
863 : }
864 0 : return buffer;
865 : }
866 :
867 0 : char *FastHex64ToBuffer(uint64 value, char* buffer) {
868 0 : return InternalFastHexToBuffer(value, buffer, 16);
869 : }
870 :
871 0 : char *FastHex32ToBuffer(uint32 value, char* buffer) {
872 0 : return InternalFastHexToBuffer(value, buffer, 8);
873 : }
874 :
875 : // ----------------------------------------------------------------------
876 : // FastInt32ToBufferLeft()
877 : // FastUInt32ToBufferLeft()
878 : // FastInt64ToBufferLeft()
879 : // FastUInt64ToBufferLeft()
880 : //
881 : // Like the Fast*ToBuffer() functions above, these are intended for speed.
882 : // Unlike the Fast*ToBuffer() functions, however, these functions write
883 : // their output to the beginning of the buffer (hence the name, as the
884 : // output is left-aligned). The caller is responsible for ensuring that
885 : // the buffer has enough space to hold the output.
886 : //
887 : // Returns a pointer to the end of the string (i.e. the null character
888 : // terminating the string).
889 : // ----------------------------------------------------------------------
890 :
// Lookup table mapping each value 0..99 to its two-character decimal
// representation: two_ASCII_digits[n] == { '0' + n / 10, '0' + n % 10 }.
// Each row below holds the ten entries that share a tens digit.
static const char two_ASCII_digits[100][2] = {
  {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
  {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
  {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
  {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
  {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
  {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
  {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
  {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
  {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
  {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
};
913 :
914 6062 : char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
915 : int digits;
916 6062 : const char *ASCII_digits = NULL;
917 : // The idea of this implementation is to trim the number of divides to as few
918 : // as possible by using multiplication and subtraction rather than mod (%),
919 : // and by outputting two digits at a time rather than one.
920 : // The huge-number case is first, in the hopes that the compiler will output
921 : // that case in one branch-free block of code, and only output conditional
922 : // branches into it from below.
923 6062 : if (u >= 1000000000) { // >= 1,000,000,000
924 26 : digits = u / 100000000; // 100,000,000
925 26 : ASCII_digits = two_ASCII_digits[digits];
926 26 : buffer[0] = ASCII_digits[0];
927 26 : buffer[1] = ASCII_digits[1];
928 26 : buffer += 2;
929 : sublt100_000_000:
930 28 : u -= digits * 100000000; // 100,000,000
931 : lt100_000_000:
932 55 : digits = u / 1000000; // 1,000,000
933 55 : ASCII_digits = two_ASCII_digits[digits];
934 55 : buffer[0] = ASCII_digits[0];
935 55 : buffer[1] = ASCII_digits[1];
936 55 : buffer += 2;
937 : sublt1_000_000:
938 66 : u -= digits * 1000000; // 1,000,000
939 : lt1_000_000:
940 68 : digits = u / 10000; // 10,000
941 68 : ASCII_digits = two_ASCII_digits[digits];
942 68 : buffer[0] = ASCII_digits[0];
943 68 : buffer[1] = ASCII_digits[1];
944 68 : buffer += 2;
945 : sublt10_000:
946 169 : u -= digits * 10000; // 10,000
947 : lt10_000:
948 276 : digits = u / 100;
949 276 : ASCII_digits = two_ASCII_digits[digits];
950 276 : buffer[0] = ASCII_digits[0];
951 276 : buffer[1] = ASCII_digits[1];
952 276 : buffer += 2;
953 : sublt100:
954 3259 : u -= digits * 100;
955 : lt100:
956 5074 : digits = u;
957 5074 : ASCII_digits = two_ASCII_digits[digits];
958 5074 : buffer[0] = ASCII_digits[0];
959 5074 : buffer[1] = ASCII_digits[1];
960 5074 : buffer += 2;
961 : done:
962 6062 : *buffer = 0;
963 6062 : return buffer;
964 : }
965 :
966 6036 : if (u < 100) {
967 2803 : digits = u;
968 2803 : if (u >= 10) goto lt100;
969 988 : *buffer++ = '0' + digits;
970 988 : goto done;
971 : }
972 3233 : if (u < 10000) { // 10,000
973 3090 : if (u >= 1000) goto lt10_000;
974 2983 : digits = u / 100;
975 2983 : *buffer++ = '0' + digits;
976 2983 : goto sublt100;
977 : }
978 143 : if (u < 1000000) { // 1,000,000
979 103 : if (u >= 100000) goto lt1_000_000;
980 101 : digits = u / 10000; // 10,000
981 101 : *buffer++ = '0' + digits;
982 101 : goto sublt10_000;
983 : }
984 40 : if (u < 100000000) { // 100,000,000
985 38 : if (u >= 10000000) goto lt100_000_000;
986 11 : digits = u / 1000000; // 1,000,000
987 11 : *buffer++ = '0' + digits;
988 11 : goto sublt1_000_000;
989 : }
990 : // we already know that u < 1,000,000,000
991 2 : digits = u / 100000000; // 100,000,000
992 2 : *buffer++ = '0' + digits;
993 2 : goto sublt100_000_000;
994 : }
995 :
996 4 : char* FastInt32ToBufferLeft(int32 i, char* buffer) {
997 4 : uint32 u = i;
998 4 : if (i < 0) {
999 0 : *buffer++ = '-';
1000 0 : u = -i;
1001 : }
1002 4 : return FastUInt32ToBufferLeft(u, buffer);
1003 : }
1004 :
1005 1758 : char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1006 : int digits;
1007 1758 : const char *ASCII_digits = NULL;
1008 :
1009 1758 : uint32 u = static_cast<uint32>(u64);
1010 1758 : if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1011 :
1012 24 : uint64 top_11_digits = u64 / 1000000000;
1013 24 : buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1014 24 : u = u64 - (top_11_digits * 1000000000);
1015 :
1016 24 : digits = u / 10000000; // 10,000,000
1017 : GOOGLE_DCHECK_LT(digits, 100);
1018 24 : ASCII_digits = two_ASCII_digits[digits];
1019 24 : buffer[0] = ASCII_digits[0];
1020 24 : buffer[1] = ASCII_digits[1];
1021 24 : buffer += 2;
1022 24 : u -= digits * 10000000; // 10,000,000
1023 24 : digits = u / 100000; // 100,000
1024 24 : ASCII_digits = two_ASCII_digits[digits];
1025 24 : buffer[0] = ASCII_digits[0];
1026 24 : buffer[1] = ASCII_digits[1];
1027 24 : buffer += 2;
1028 24 : u -= digits * 100000; // 100,000
1029 24 : digits = u / 1000; // 1,000
1030 24 : ASCII_digits = two_ASCII_digits[digits];
1031 24 : buffer[0] = ASCII_digits[0];
1032 24 : buffer[1] = ASCII_digits[1];
1033 24 : buffer += 2;
1034 24 : u -= digits * 1000; // 1,000
1035 24 : digits = u / 10;
1036 24 : ASCII_digits = two_ASCII_digits[digits];
1037 24 : buffer[0] = ASCII_digits[0];
1038 24 : buffer[1] = ASCII_digits[1];
1039 24 : buffer += 2;
1040 24 : u -= digits * 10;
1041 24 : digits = u;
1042 24 : *buffer++ = '0' + digits;
1043 24 : *buffer = 0;
1044 24 : return buffer;
1045 : }
1046 :
1047 0 : char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1048 0 : uint64 u = i;
1049 0 : if (i < 0) {
1050 0 : *buffer++ = '-';
1051 0 : u = -i;
1052 : }
1053 0 : return FastUInt64ToBufferLeft(u, buffer);
1054 : }
1055 :
1056 : // ----------------------------------------------------------------------
1057 : // SimpleItoa()
1058 : // Description: converts an integer to a string.
1059 : //
1060 : // Return value: string
1061 : // ----------------------------------------------------------------------
1062 :
1063 36934 : string SimpleItoa(int i) {
1064 : char buffer[kFastToBufferSize];
1065 : return (sizeof(i) == 4) ?
1066 : FastInt32ToBuffer(i, buffer) :
1067 36934 : FastInt64ToBuffer(i, buffer);
1068 : }
1069 :
1070 4324 : string SimpleItoa(unsigned int i) {
1071 : char buffer[kFastToBufferSize];
1072 : return string(buffer, (sizeof(i) == 4) ?
1073 : FastUInt32ToBufferLeft(i, buffer) :
1074 8648 : FastUInt64ToBufferLeft(i, buffer));
1075 : }
1076 :
1077 0 : string SimpleItoa(long i) {
1078 : char buffer[kFastToBufferSize];
1079 : return (sizeof(i) == 4) ?
1080 : FastInt32ToBuffer(i, buffer) :
1081 0 : FastInt64ToBuffer(i, buffer);
1082 : }
1083 :
1084 1531 : string SimpleItoa(unsigned long i) {
1085 : char buffer[kFastToBufferSize];
1086 : return string(buffer, (sizeof(i) == 4) ?
1087 : FastUInt32ToBufferLeft(i, buffer) :
1088 3062 : FastUInt64ToBufferLeft(i, buffer));
1089 : }
1090 :
1091 217 : string SimpleItoa(long long i) {
1092 : char buffer[kFastToBufferSize];
1093 : return (sizeof(i) == 4) ?
1094 : FastInt32ToBuffer(i, buffer) :
1095 217 : FastInt64ToBuffer(i, buffer);
1096 : }
1097 :
1098 203 : string SimpleItoa(unsigned long long i) {
1099 : char buffer[kFastToBufferSize];
1100 : return string(buffer, (sizeof(i) == 4) ?
1101 : FastUInt32ToBufferLeft(i, buffer) :
1102 406 : FastUInt64ToBufferLeft(i, buffer));
1103 : }
1104 :
1105 : // ----------------------------------------------------------------------
1106 : // SimpleDtoa()
1107 : // SimpleFtoa()
1108 : // DoubleToBuffer()
1109 : // FloatToBuffer()
1110 : // We want to print the value without losing precision, but we also do
1111 : // not want to print more digits than necessary. This turns out to be
1112 : // trickier than it sounds. Numbers like 0.2 cannot be represented
1113 : // exactly in binary. If we print 0.2 with a very large precision,
1114 : // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1115 : // On the other hand, if we set the precision too low, we lose
1116 : // significant digits when printing numbers that actually need them.
1117 : // It turns out there is no precision value that does the right thing
1118 : // for all numbers.
1119 : //
1120 : // Our strategy is to first try printing with a precision that is never
1121 : // over-precise, then parse the result with strtod() to see if it
1122 : // matches. If not, we print again with a precision that will always
1123 : // give a precise result, but may use more digits than necessary.
1124 : //
1125 : // An arguably better strategy would be to use the algorithm described
1126 : // in "How to Print Floating-Point Numbers Accurately" by Steele &
1127 : // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1128 : // however, that the following implementation is about as fast as
1129 : // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1130 : // will not scale well on multi-core machines. DMG's code is slightly
1131 : // more accurate (in that it will never use more digits than
1132 : // necessary), but this is probably irrelevant for most users.
1133 : //
1134 : // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1135 : // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1136 : // one in that it makes guesses and then uses strtod() to check them.
1137 : // Their implementation is faster because they use their own code to
1138 : // generate the digits in the first place rather than use snprintf(),
1139 : // thus avoiding format string parsing overhead. However, this makes
1140 : // it considerably more complicated than the following implementation,
1141 : // and it is embedded in a larger library. If speed turns out to be
1142 : // an issue, we could re-implement this in terms of their
1143 : // implementation.
1144 : // ----------------------------------------------------------------------
1145 :
1146 140 : string SimpleDtoa(double value) {
1147 : char buffer[kDoubleToBufferSize];
1148 140 : return DoubleToBuffer(value, buffer);
1149 : }
1150 :
1151 151 : string SimpleFtoa(float value) {
1152 : char buffer[kFloatToBufferSize];
1153 151 : return FloatToBuffer(value, buffer);
1154 : }
1155 :
// Returns true if |c| is a decimal digit, an exponent marker ('e' / 'E')
// or a sign ('+' / '-'). Notably the radix character is NOT accepted, which
// is what lets DelocalizeRadix() locate it.
static inline bool IsValidFloatChar(char c) {
  if ('0' <= c && c <= '9') return true;
  switch (c) {
    case 'e':
    case 'E':
    case '+':
    case '-':
      return true;
    default:
      return false;
  }
}
1161 :
1162 237 : void DelocalizeRadix(char* buffer) {
1163 : // Fast check: if the buffer has a normal decimal point, assume no
1164 : // translation is needed.
1165 237 : if (strchr(buffer, '.') != NULL) return;
1166 :
1167 : // Find the first unknown character.
1168 1020 : while (IsValidFloatChar(*buffer)) ++buffer;
1169 :
1170 184 : if (*buffer == '\0') {
1171 : // No radix character found.
1172 : return;
1173 : }
1174 :
1175 : // We are now pointing at the locale-specific radix character. Replace it
1176 : // with '.'.
1177 0 : *buffer = '.';
1178 0 : ++buffer;
1179 :
1180 0 : if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1181 : // It appears the radix was a multi-byte character. We need to remove the
1182 : // extra bytes.
1183 : char* target = buffer;
1184 0 : do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1185 0 : memmove(target, buffer, strlen(buffer) + 1);
1186 : }
1187 : }
1188 :
1189 140 : char* DoubleToBuffer(double value, char* buffer) {
1190 : // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1191 : // platforms these days. Just in case some system exists where DBL_DIG
1192 : // is significantly larger -- and risks overflowing our buffer -- we have
1193 : // this assert.
1194 : GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1195 :
1196 140 : if (value == numeric_limits<double>::infinity()) {
1197 : strcpy(buffer, "inf");
1198 12 : return buffer;
1199 128 : } else if (value == -numeric_limits<double>::infinity()) {
1200 : strcpy(buffer, "-inf");
1201 8 : return buffer;
1202 120 : } else if (MathLimits<double>::IsNaN(value)) {
1203 : strcpy(buffer, "nan");
1204 10 : return buffer;
1205 : }
1206 :
1207 : int snprintf_result =
1208 110 : snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1209 :
1210 : // The snprintf should never overflow because the buffer is significantly
1211 : // larger than the precision we asked for.
1212 : GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1213 :
1214 : // We need to make parsed_value volatile in order to force the compiler to
1215 : // write it out to the stack. Otherwise, it may keep the value in a
1216 : // register, and if it does that, it may keep it as a long double instead
1217 : // of a double. This long double may have extra bits that make it compare
1218 : // unequal to "value" even though it would be exactly equal if it were
1219 : // truncated to a double.
1220 110 : volatile double parsed_value = strtod(buffer, NULL);
1221 110 : if (parsed_value != value) {
1222 : int snprintf_result =
1223 0 : snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1224 :
1225 : // Should never overflow; see above.
1226 : GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1227 : }
1228 :
1229 110 : DelocalizeRadix(buffer);
1230 : return buffer;
1231 : }
1232 :
1233 0 : static int memcasecmp(const char *s1, const char *s2, size_t len) {
1234 0 : const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1235 0 : const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1236 :
1237 0 : for ( int i = 0; i < len; i++ ) {
1238 : const int diff =
1239 0 : static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1240 0 : static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1241 0 : if (diff != 0) return diff;
1242 : }
1243 : return 0;
1244 : }
1245 :
1246 : inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1247 0 : if (s1.size() != s2.size()) return false;
1248 0 : return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1249 : }
1250 :
1251 0 : bool safe_strtob(StringPiece str, bool* value) {
1252 0 : GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";
1253 0 : if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1254 0 : CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1255 0 : CaseEqual(str, "1")) {
1256 0 : *value = true;
1257 0 : return true;
1258 : }
1259 0 : if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1260 0 : CaseEqual(str, "no") || CaseEqual(str, "n") ||
1261 0 : CaseEqual(str, "0")) {
1262 0 : *value = false;
1263 0 : return true;
1264 : }
1265 : return false;
1266 : }
1267 :
// Parses |str| as a float into |*value|. Returns true only if |str| is
// non-empty, the whole string was consumed, and the conversion did not set
// errno. |*value| is written even when false is returned.
bool safe_strtof(const char* str, float* value) {
  char* parse_end;
  errno = 0;  // errno only gets set on errors
#if defined(_WIN32) || defined (__hpux)  // has no strtof()
  *value = strtod(str, &parse_end);
#else
  *value = strtof(str, &parse_end);
#endif
  if (*str == 0) return false;        // empty input
  if (*parse_end != 0) return false;  // trailing unparsed characters
  return errno == 0;
}
1278 :
1279 0 : bool safe_strtod(const char* str, double* value) {
1280 : char* endptr;
1281 0 : *value = strtod(str, &endptr);
1282 0 : if (endptr != str) {
1283 0 : while (ascii_isspace(*endptr)) ++endptr;
1284 : }
1285 : // Ignore range errors from strtod. The values it
1286 : // returns on underflow and overflow are the right
1287 : // fallback in a robust setting.
1288 0 : return *str != '\0' && *endptr == '\0';
1289 : }
1290 :
1291 0 : bool safe_strto32(const string& str, int32* value) {
1292 0 : return safe_int_internal(str, value);
1293 : }
1294 :
1295 0 : bool safe_strtou32(const string& str, uint32* value) {
1296 0 : return safe_uint_internal(str, value);
1297 : }
1298 :
1299 0 : bool safe_strto64(const string& str, int64* value) {
1300 0 : return safe_int_internal(str, value);
1301 : }
1302 :
1303 0 : bool safe_strtou64(const string& str, uint64* value) {
1304 0 : return safe_uint_internal(str, value);
1305 : }
1306 :
1307 151 : char* FloatToBuffer(float value, char* buffer) {
1308 : // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1309 : // platforms these days. Just in case some system exists where FLT_DIG
1310 : // is significantly larger -- and risks overflowing our buffer -- we have
1311 : // this assert.
1312 : GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1313 :
1314 151 : if (value == numeric_limits<double>::infinity()) {
1315 : strcpy(buffer, "inf");
1316 8 : return buffer;
1317 143 : } else if (value == -numeric_limits<double>::infinity()) {
1318 : strcpy(buffer, "-inf");
1319 8 : return buffer;
1320 135 : } else if (MathLimits<float>::IsNaN(value)) {
1321 : strcpy(buffer, "nan");
1322 8 : return buffer;
1323 : }
1324 :
1325 : int snprintf_result =
1326 254 : snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1327 :
1328 : // The snprintf should never overflow because the buffer is significantly
1329 : // larger than the precision we asked for.
1330 : GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1331 :
1332 : float parsed_value;
1333 127 : if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1334 : int snprintf_result =
1335 0 : snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value);
1336 :
1337 : // Should never overflow; see above.
1338 : GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1339 : }
1340 :
1341 127 : DelocalizeRadix(buffer);
1342 : return buffer;
1343 : }
1344 :
1345 : namespace strings {
1346 :
1347 12771 : AlphaNum::AlphaNum(strings::Hex hex) {
1348 12771 : char *const end = &digits[kFastToBufferSize];
1349 12771 : char *writer = end;
1350 12771 : uint64 value = hex.value;
1351 12771 : uint64 width = hex.spec;
1352 : // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1353 : // where 0x10000 is the smallest hex number that is as wide as the user
1354 : // asked for.
1355 12771 : uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1356 : static const char hexdigits[] = "0123456789abcdef";
1357 42510 : do {
1358 42510 : *--writer = hexdigits[value & 0xF];
1359 42510 : value >>= 4;
1360 42510 : mask >>= 4;
1361 : } while (mask != 0);
1362 12771 : piece_data_ = writer;
1363 12771 : piece_size_ = end - writer;
1364 12771 : }
1365 :
1366 : } // namespace strings
1367 :
1368 : // ----------------------------------------------------------------------
1369 : // StrCat()
1370 : // This merges the given strings or integers, with no delimiter. This
1371 : // is designed to be the fastest possible way to construct a string out
1372 : // of a mix of raw C strings, C++ strings, and integer values.
1373 : // ----------------------------------------------------------------------
1374 :
1375 : // Append is merely a version of memcpy that returns the address of the byte
1376 : // after the area just overwritten. It comes in multiple flavors to minimize
1377 : // call overhead.
1378 29385 : static char *Append1(char *out, const AlphaNum &x) {
1379 19590 : memcpy(out, x.data(), x.size());
1380 9795 : return out + x.size();
1381 : }
1382 :
1383 120198 : static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1384 40066 : memcpy(out, x1.data(), x1.size());
1385 20033 : out += x1.size();
1386 :
1387 40066 : memcpy(out, x2.data(), x2.size());
1388 20033 : return out + x2.size();
1389 : }
1390 :
1391 55 : static char *Append4(char *out,
1392 330 : const AlphaNum &x1, const AlphaNum &x2,
1393 330 : const AlphaNum &x3, const AlphaNum &x4) {
1394 110 : memcpy(out, x1.data(), x1.size());
1395 55 : out += x1.size();
1396 :
1397 110 : memcpy(out, x2.data(), x2.size());
1398 55 : out += x2.size();
1399 :
1400 110 : memcpy(out, x3.data(), x3.size());
1401 55 : out += x3.size();
1402 :
1403 110 : memcpy(out, x4.data(), x4.size());
1404 55 : return out + x4.size();
1405 : }
1406 :
1407 885 : string StrCat(const AlphaNum &a, const AlphaNum &b) {
1408 : string result;
1409 590 : result.resize(a.size() + b.size());
1410 295 : char *const begin = &*result.begin();
1411 295 : char *out = Append2(begin, a, b);
1412 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1413 295 : return result;
1414 : }
1415 :
1416 39164 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1417 : string result;
1418 29373 : result.resize(a.size() + b.size() + c.size());
1419 9791 : char *const begin = &*result.begin();
1420 9791 : char *out = Append2(begin, a, b);
1421 9791 : out = Append1(out, c);
1422 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1423 9791 : return result;
1424 : }
1425 :
1426 204 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1427 51 : const AlphaNum &d) {
1428 : string result;
1429 204 : result.resize(a.size() + b.size() + c.size() + d.size());
1430 51 : char *const begin = &*result.begin();
1431 51 : char *out = Append4(begin, a, b, c, d);
1432 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1433 51 : return result;
1434 : }
1435 :
1436 0 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1437 0 : const AlphaNum &d, const AlphaNum &e) {
1438 : string result;
1439 0 : result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1440 0 : char *const begin = &*result.begin();
1441 0 : char *out = Append4(begin, a, b, c, d);
1442 0 : out = Append1(out, e);
1443 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1444 0 : return result;
1445 : }
1446 :
1447 0 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1448 0 : const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1449 : string result;
1450 0 : result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1451 0 : f.size());
1452 0 : char *const begin = &*result.begin();
1453 0 : char *out = Append4(begin, a, b, c, d);
1454 0 : out = Append2(out, e, f);
1455 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1456 0 : return result;
1457 : }
1458 :
1459 16 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1460 12 : const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1461 4 : const AlphaNum &g) {
1462 : string result;
1463 24 : result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1464 8 : f.size() + g.size());
1465 4 : char *const begin = &*result.begin();
1466 4 : char *out = Append4(begin, a, b, c, d);
1467 4 : out = Append2(out, e, f);
1468 4 : out = Append1(out, g);
1469 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1470 4 : return result;
1471 : }
1472 :
1473 0 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1474 0 : const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1475 0 : const AlphaNum &g, const AlphaNum &h) {
1476 : string result;
1477 0 : result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1478 0 : f.size() + g.size() + h.size());
1479 0 : char *const begin = &*result.begin();
1480 0 : char *out = Append4(begin, a, b, c, d);
1481 0 : out = Append4(out, e, f, g, h);
1482 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1483 0 : return result;
1484 : }
1485 :
1486 0 : string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1487 0 : const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1488 0 : const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1489 : string result;
1490 0 : result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1491 0 : f.size() + g.size() + h.size() + i.size());
1492 0 : char *const begin = &*result.begin();
1493 0 : char *out = Append4(begin, a, b, c, d);
1494 0 : out = Append4(out, e, f, g, h);
1495 0 : out = Append1(out, i);
1496 : GOOGLE_DCHECK_EQ(out, begin + result.size());
1497 0 : return result;
1498 : }
1499 :
// StrAppend can be handed a char * that points partway into the very string
// being appended to, in which case the outcome is unpredictable. This macro
// checks for that situation (intended for debug mode). The pointer
// difference is converted to an unsigned value so that a single comparison
// rejects any |src| that starts inside |dest|'s current contents.
#define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
    GOOGLE_DCHECK_GT(static_cast<uintptr_t>((src).data() - (dest).data()), \
                     static_cast<uintptr_t>((dest).size()))
1507 :
1508 3048 : void StrAppend(string *result, const AlphaNum &a) {
1509 : GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1510 3048 : result->append(a.data(), a.size());
1511 1524 : }
1512 :
1513 29829 : void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1514 : GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1515 : GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1516 9943 : string::size_type old_size = result->size();
1517 19886 : result->resize(old_size + a.size() + b.size());
1518 9943 : char *const begin = &*result->begin();
1519 9943 : char *out = Append2(begin + old_size, a, b);
1520 : GOOGLE_DCHECK_EQ(out, begin + result->size());
1521 9943 : }
1522 :
1523 0 : void StrAppend(string *result,
1524 0 : const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1525 : GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1526 : GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1527 : GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1528 0 : string::size_type old_size = result->size();
1529 0 : result->resize(old_size + a.size() + b.size() + c.size());
1530 0 : char *const begin = &*result->begin();
1531 0 : char *out = Append2(begin + old_size, a, b);
1532 0 : out = Append1(out, c);
1533 : GOOGLE_DCHECK_EQ(out, begin + result->size());
1534 0 : }
1535 :
1536 0 : void StrAppend(string *result,
1537 0 : const AlphaNum &a, const AlphaNum &b,
1538 0 : const AlphaNum &c, const AlphaNum &d) {
1539 : GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1540 : GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1541 : GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1542 : GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1543 0 : string::size_type old_size = result->size();
1544 0 : result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1545 0 : char *const begin = &*result->begin();
1546 0 : char *out = Append4(begin + old_size, a, b, c, d);
1547 : GOOGLE_DCHECK_EQ(out, begin + result->size());
1548 0 : }
1549 :
1550 0 : int GlobalReplaceSubstring(const string& substring,
1551 : const string& replacement,
1552 : string* s) {
1553 0 : GOOGLE_CHECK(s != NULL);
1554 0 : if (s->empty() || substring.empty())
1555 : return 0;
1556 : string tmp;
1557 0 : int num_replacements = 0;
1558 0 : int pos = 0;
1559 0 : for (int match_pos = s->find(substring.data(), pos, substring.length());
1560 : match_pos != string::npos;
1561 : pos = match_pos + substring.length(),
1562 0 : match_pos = s->find(substring.data(), pos, substring.length())) {
1563 0 : ++num_replacements;
1564 : // Append the original content before the match.
1565 0 : tmp.append(*s, pos, match_pos - pos);
1566 : // Append the replacement for the match.
1567 0 : tmp.append(replacement.begin(), replacement.end());
1568 : }
1569 : // Append the content after the last match. If no replacements were made, the
1570 : // original string is left untouched.
1571 0 : if (num_replacements > 0) {
1572 0 : tmp.append(*s, pos, s->length() - pos);
1573 0 : s->swap(tmp);
1574 : }
1575 0 : return num_replacements;
1576 : }
1577 :
// Returns the number of characters needed to base64-encode |input_len|
// bytes, with or without trailing '=' padding according to |do_padding|.
int CalculateBase64EscapedLen(int input_len, bool do_padding) {
  // Base64 encodes each three bytes of input into four bytes of output.
  // What remains after the complete groups determines the tail, following
  // the three cases listed in http://tools.ietf.org/html/rfc3548:
  //   (1) nothing left over: the output is a whole number of 4-character
  //       units with no "=" padding;
  //   (2) exactly 8 bits left: two characters, followed by two "=" when
  //       padding is requested;
  //   (3) exactly 16 bits left: three characters, followed by one "=" when
  //       padding is requested.
  const int full_groups = input_len / 3;
  const int leftover = input_len % 3;
  int len = full_groups * 4;

  if (leftover == 1) {
    // Case (2).
    len += do_padding ? 4 : 2;
  } else if (leftover != 0) {
    // Case (3).
    len += do_padding ? 4 : 3;
  }
  // Case (1) adds nothing.

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}
1624 :
1625 : // Base64Escape does padding, so this calculation includes padding.
1626 0 : int CalculateBase64EscapedLen(int input_len) {
1627 0 : return CalculateBase64EscapedLen(input_len, true);
1628 : }
1629 :
1630 : // ----------------------------------------------------------------------
1631 : // int Base64Unescape() - base64 decoder
1632 : // int Base64Escape() - base64 encoder
1633 : // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1634 : // int WebSafeBase64Escape() - Google's variation of base64 encoder
1635 : //
1636 : // Check out
1637 : // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1638 : // care about is that...
1639 : // Take the encoded stuff in groups of 4 characters and turn each
1640 : // character into a code 0 to 63 thus:
1641 : // A-Z map to 0 to 25
1642 : // a-z map to 26 to 51
1643 : // 0-9 map to 52 to 61
1644 : // +(- for WebSafe) maps to 62
1645 : // /(_ for WebSafe) maps to 63
1646 : // There will be four numbers, all less than 64 which can be represented
1647 : // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1648 : // Arrange the 6 digit binary numbers into three bytes as such:
1649 : // aaaaaabb bbbbcccc ccdddddd
1650 : // Equals signs (one or two) are used at the end of the encoded block to
1651 : // indicate that the text was not an integer multiple of three bytes long.
1652 : // ----------------------------------------------------------------------
1653 :
// Decodes base64 text from src_param (at most szsrc bytes) using the lookup
// table unbase64, which maps each byte value to its 6-bit code, or to a
// negative value for bytes that are not part of the alphabet.
//
// If dest is non-null, decoded bytes are stored there and szdest is its
// capacity in bytes.  If dest is null, the input is only validated and
// measured; nothing is stored.
//
// Whitespace (per ascii_isspace) is skipped, a NUL byte ends the data, and the
// data may be followed by '=' or '.' pad characters: either none at all, or
// exactly the number implied by the amount of leftover data.
//
// Returns the decoded length in bytes, or -1 if the input is malformed or the
// decoded data does not fit in szdest bytes.
1654 0 : int Base64UnescapeInternal(const char *src_param, int szsrc,
1655 : char *dest, int szdest,
1656 : const signed char* unbase64) {
1657 : static const char kPad64Equals = '=';
1658 : static const char kPad64Dot = '.';
1659 :
// decode:  table value of the most recently read character (< 0 if bad).
// destidx: number of output bytes produced (or counted) so far.
// state:   number of data characters accumulated in temp, 0..3.
// ch:      most recently read input character.
// temp:    accumulator for the 6-bit codes of the current group.
1660 0 : int decode = 0;
1661 0 : int destidx = 0;
1662 0 : int state = 0;
1663 0 : unsigned int ch = 0;
1664 0 : unsigned int temp = 0;
1665 :
1666 : // If "char" is signed by default, using *src as an array index results in
1667 : // accessing negative array elements. Treat the input as a pointer to
1668 : // unsigned char to avoid this.
1669 0 : const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1670 :
1671 : // The GET_INPUT macro gets the next input character, skipping
1672 : // over any whitespace, and stopping when we reach the end of the
1673 : // string or when we read any non-data character. The arguments are
1674 : // an arbitrary identifier (used as a label for goto) and the number
1675 : // of data bytes that must remain in the input to avoid aborting the
1676 : // loop.
// On a non-data character the macro records how many data characters of the
// current group were already read (state = 4 - remain) and breaks out of the
// enclosing while loop, so it may only be expanded directly inside that loop.
1677 : #define GET_INPUT(label, remain) \
1678 : label: \
1679 : --szsrc; \
1680 : ch = *src++; \
1681 : decode = unbase64[ch]; \
1682 : if (decode < 0) { \
1683 : if (ascii_isspace(ch) && szsrc >= remain) \
1684 : goto label; \
1685 : state = 4 - remain; \
1686 : break; \
1687 : }
1688 :
1689 : // if dest is null, we're just checking to see if it's legal input
1690 : // rather than producing output. (I suspect this could just be done
1691 : // with a regexp...). We duplicate the loop so this test can be
1692 : // outside it instead of in every iteration.
1693 :
1694 0 : if (dest) {
1695 : // This loop consumes 4 input bytes and produces 3 output bytes
1696 : // per iteration. We can't know at the start that there is enough
1697 : // data left in the string for a full iteration, so the loop may
1698 : // break out in the middle; if so 'state' will be set to the
1699 : // number of input bytes read.
1700 :
1701 0 : while (szsrc >= 4) {
1702 : // We'll start by optimistically assuming that the next four
1703 : // bytes of the string (src[0..3]) are four good data bytes
1704 : // (that is, no nulls, whitespace, padding chars, or illegal
1705 : // chars). We need to test src[0..2] for nulls individually
1706 : // before constructing temp to preserve the property that we
1707 : // never read past a null in the string (no matter how long
1708 : // szsrc claims the string is).
1709 :
1710 0 : if (!src[0] || !src[1] || !src[2] ||
1711 0 : (temp = ((unsigned(unbase64[src[0]]) << 18) |
1712 0 : (unsigned(unbase64[src[1]]) << 12) |
1713 0 : (unsigned(unbase64[src[2]]) << 6) |
1714 0 : (unsigned(unbase64[src[3]])))) & 0x80000000) {
1715 : // Iff any of those four characters was bad (null, illegal,
1716 : // whitespace, padding), then temp's high bit will be set
1717 : // (because unbase64[] is -1 for all bad characters).
1718 : //
1719 : // We'll back up and resort to the slower decoder, which knows
1720 : // how to handle those cases.
1721 :
1722 0 : GET_INPUT(first, 4);
1723 0 : temp = decode;
1724 0 : GET_INPUT(second, 3);
1725 0 : temp = (temp << 6) | decode;
1726 0 : GET_INPUT(third, 2);
1727 0 : temp = (temp << 6) | decode;
1728 0 : GET_INPUT(fourth, 1);
1729 0 : temp = (temp << 6) | decode;
1730 : } else {
1731 : // We really did have four good data bytes, so advance four
1732 : // characters in the string.
1733 :
1734 0 : szsrc -= 4;
1735 0 : src += 4;
// Reset decode/ch to the "no bad character seen" values that the checks
// after the loop treat as a normal end of the fast path.
1736 0 : decode = -1;
1737 0 : ch = '\0';
1738 : }
1739 :
1740 : // temp has 24 bits of input, so write that out as three bytes.
1741 :
1742 0 : if (destidx+3 > szdest) return -1;
1743 0 : dest[destidx+2] = temp;
1744 0 : temp >>= 8;
1745 0 : dest[destidx+1] = temp;
1746 0 : temp >>= 8;
1747 0 : dest[destidx] = temp;
1748 0 : destidx += 3;
1749 : }
1750 : } else {
// Validation-only variant of the loop above: the same scanning is done but
// nothing is stored; destidx still counts the bytes that would be produced.
1751 0 : while (szsrc >= 4) {
1752 0 : if (!src[0] || !src[1] || !src[2] ||
1753 0 : (temp = ((unsigned(unbase64[src[0]]) << 18) |
1754 0 : (unsigned(unbase64[src[1]]) << 12) |
1755 0 : (unsigned(unbase64[src[2]]) << 6) |
1756 0 : (unsigned(unbase64[src[3]])))) & 0x80000000) {
1757 0 : GET_INPUT(first_no_dest, 4);
1758 0 : GET_INPUT(second_no_dest, 3);
1759 0 : GET_INPUT(third_no_dest, 2);
1760 0 : GET_INPUT(fourth_no_dest, 1);
1761 : } else {
1762 0 : szsrc -= 4;
1763 0 : src += 4;
1764 0 : decode = -1;
1765 0 : ch = '\0';
1766 : }
1767 0 : destidx += 3;
1768 : }
1769 : }
1770 :
1771 : #undef GET_INPUT
1772 :
1773 : // if the loop terminated because we read a bad character, return
1774 : // now.
// NUL, pad characters and whitespace are not errors at this point; they are
// handled by the code below.
// NOTE(review): when GET_INPUT stops on a NUL byte, src has already been
// advanced past it, so the one-byte loop below resumes reading after the NUL
// whenever szsrc > 0.  Confirm this is intended, given the "never read past a
// null" property described above.
1775 0 : if (decode < 0 && ch != '\0' &&
1776 0 : ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1777 : return -1;
1778 :
1779 0 : if (ch == kPad64Equals || ch == kPad64Dot) {
1780 : // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1781 : // look at it again when we count to check for the proper number of
1782 : // equals signs at the end.
1783 0 : ++szsrc;
1784 0 : --src;
1785 : } else {
1786 : // This loop consumes 1 input byte per iteration. It's used to
1787 : // clean up the 0-3 input bytes remaining when the first, faster
1788 : // loop finishes. 'temp' contains the data from 'state' input
1789 : // characters read by the first loop.
1790 0 : while (szsrc > 0) {
1791 0 : --szsrc;
1792 0 : ch = *src++;
1793 0 : decode = unbase64[ch];
1794 0 : if (decode < 0) {
1795 0 : if (ascii_isspace(ch)) {
1796 : continue;
1797 0 : } else if (ch == '\0') {
1798 : break;
1799 0 : } else if (ch == kPad64Equals || ch == kPad64Dot) {
1800 : // back up one character; we'll read it again when we check
1801 : // for the correct number of pad characters at the end.
1802 : ++szsrc;
1803 : --src;
1804 : break;
1805 : } else {
1806 : return -1;
1807 : }
1808 : }
1809 :
1810 : // Each input character gives us six bits of output.
1811 0 : temp = (temp << 6) | decode;
1812 0 : ++state;
1813 0 : if (state == 4) {
1814 : // If we've accumulated 24 bits of output, write that out as
1815 : // three bytes.
1816 0 : if (dest) {
1817 0 : if (destidx+3 > szdest) return -1;
1818 0 : dest[destidx+2] = temp;
1819 0 : temp >>= 8;
1820 0 : dest[destidx+1] = temp;
1821 0 : temp >>= 8;
1822 0 : dest[destidx] = temp;
1823 : }
1824 0 : destidx += 3;
1825 0 : state = 0;
1826 0 : temp = 0;
1827 : }
1828 : }
1829 : }
1830 :
1831 : // Process the leftover data contained in 'temp' at the end of the input.
// expected_equals is the number of pad characters that a padded encoding of
// the leftover data would carry.
1832 0 : int expected_equals = 0;
1833 0 : switch (state) {
1834 : case 0:
1835 : // Nothing left over; output is a multiple of 3 bytes.
1836 : break;
1837 :
1838 : case 1:
1839 : // Bad input; we have 6 bits left over.
1840 : return -1;
1841 :
1842 : case 2:
1843 : // Produce one more output byte from the 12 input bits we have left.
1844 0 : if (dest) {
1845 0 : if (destidx+1 > szdest) return -1;
1846 0 : temp >>= 4;
1847 0 : dest[destidx] = temp;
1848 : }
1849 0 : ++destidx;
1850 0 : expected_equals = 2;
1851 0 : break;
1852 :
1853 : case 3:
1854 : // Produce two more output bytes from the 18 input bits we have left.
1855 0 : if (dest) {
1856 0 : if (destidx+2 > szdest) return -1;
1857 0 : temp >>= 2;
1858 0 : dest[destidx+1] = temp;
1859 0 : temp >>= 8;
1860 0 : dest[destidx] = temp;
1861 : }
1862 0 : destidx += 2;
1863 0 : expected_equals = 1;
1864 0 : break;
1865 :
1866 : default:
1867 : // state should have no other values at this point.
1868 0 : GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1869 : }
1870 :
1871 : // The remainder of the string should be all whitespace, mixed with
1872 : // exactly 0 equals signs, or exactly 'expected_equals' equals
1873 : // signs. (Always accepting 0 equals signs is a google extension
1874 : // not covered in the RFC, as is accepting dot as the pad character.)
1875 :
// Scanning of the trailer stops at szsrc bytes or at the first NUL byte.
1876 0 : int equals = 0;
1877 0 : while (szsrc > 0 && *src) {
1878 0 : if (*src == kPad64Equals || *src == kPad64Dot)
1879 0 : ++equals;
1880 0 : else if (!ascii_isspace(*src))
1881 : return -1;
1882 0 : --szsrc;
1883 0 : ++src;
1884 : }
1885 :
1886 0 : return (equals == 0 || equals == expected_equals) ? destidx : -1;
1887 : }
1888 :
1889 : // The arrays below were generated by the following code
1890 : // #include <sys/time.h>
1891 : // #include <stdlib.h>
1892 : // #include <string.h>
1893 : // main()
1894 : // {
1895 : // static const char Base64[] =
1896 : // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1897 : // char *pos;
1898 : // int idx, i, j;
1899 : // printf(" ");
1900 : // for (i = 0; i < 255; i += 8) {
1901 : // for (j = i; j < i + 8; j++) {
1902 : // pos = strchr(Base64, j);
1903 : // if ((pos == NULL) || (j == 0))
1904 : // idx = -1;
1905 : // else
1906 : // idx = pos - Base64;
1907 : // if (idx == -1)
1908 : // printf(" %2d, ", idx);
1909 : // else
1910 : // printf(" %2d/*%c*/,", idx, j);
1911 : // }
1912 : // printf("\n ");
1913 : // }
1914 : // }
1915 : //
1916 : // where the value of "Base64[]" was replaced by one of the base-64 conversion
1917 : // tables from the functions below.
// Reverse lookup table for the standard base64 alphabet: indexed by byte
// value, it yields the 6-bit code of that character, or -1 for any byte that
// is not part of the alphabet.  Laid out sixteen entries per row.
static const signed char kUnBase64[] = {
  // 0x00 - 0x0F
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  // 0x10 - 0x1F
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  // 0x20 - 0x2F: '+' is 0x2B, '/' is 0x2F
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
  // 0x30 - 0x3F: '0'..'9' occupy 0x30..0x39
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
  // 0x40 - 0x4F: 'A'..'O' occupy 0x41..0x4F
  -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
  // 0x50 - 0x5F: 'P'..'Z' occupy 0x50..0x5A
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
  // 0x60 - 0x6F: 'a'..'o' occupy 0x61..0x6F
  -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  // 0x70 - 0x7F: 'p'..'z' occupy 0x70..0x7A
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
  // 0x80 - 0xFF: no valid characters
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
// Reverse lookup table for the web-safe base64 alphabet ('-' and '_' in place
// of '+' and '/'): indexed by byte value, it yields the 6-bit code of that
// character, or -1 for any byte that is not part of the alphabet.  Laid out
// sixteen entries per row.
static const signed char kUnWebSafeBase64[] = {
  // 0x00 - 0x0F
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  // 0x10 - 0x1F
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  // 0x20 - 0x2F: '-' is 0x2D
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1,
  // 0x30 - 0x3F: '0'..'9' occupy 0x30..0x39
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
  // 0x40 - 0x4F: 'A'..'O' occupy 0x41..0x4F
  -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
  // 0x50 - 0x5F: 'P'..'Z' occupy 0x50..0x5A, '_' is 0x5F
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, 63,
  // 0x60 - 0x6F: 'a'..'o' occupy 0x61..0x6F
  -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  // 0x70 - 0x7F: 'p'..'z' occupy 0x70..0x7A
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
  // 0x80 - 0xFF: no valid characters
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
1986 :
1987 0 : int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
1988 0 : return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
1989 : }
1990 :
1991 0 : static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
1992 : const signed char* unbase64) {
1993 : // Determine the size of the output string. Base64 encodes every 3 bytes into
1994 : // 4 characters. any leftover chars are added directly for good measure.
1995 : // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
1996 0 : const int dest_len = 3 * (slen / 4) + (slen % 4);
1997 :
1998 0 : dest->resize(dest_len);
1999 :
2000 : // We are getting the destination buffer by getting the beginning of the
2001 : // string and converting it into a char *.
2002 : const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2003 0 : dest_len, unbase64);
2004 0 : if (len < 0) {
2005 : dest->clear();
2006 0 : return false;
2007 : }
2008 :
2009 : // could be shorter if there was padding
2010 : GOOGLE_DCHECK_LE(len, dest_len);
2011 0 : dest->erase(len);
2012 :
2013 0 : return true;
2014 : }
2015 :
2016 0 : bool Base64Unescape(StringPiece src, string* dest) {
2017 0 : return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2018 : }
2019 :
2020 0 : bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2021 0 : return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2022 : }
2023 :
// Encodes the szsrc bytes at src as base64 text into dest, taking output
// characters from the 64-entry alphabet base64.  When do_padding is true the
// final group is padded with '=' to a multiple of four characters.
//
// dest must have room for szdest characters; the output is not
// NUL-terminated.  Returns the number of characters written, or 0 if
// szsrc <= 0 or dest is too small.  When 0 is returned because dest is too
// small, part of dest may already have been written.
int Base64EscapeInternal(const unsigned char *src, int szsrc,
                         char *dest, int szdest, const char *base64,
                         bool do_padding) {
  static const char kPad64 = '=';

  if (szsrc <= 0) return 0;

  // Quick rejection: the output needs at least 4/3 of the input size.  The
  // products are formed in 64 bits because szsrc * 4 overflows int for large
  // inputs, which would let an undersized buffer slip through.
  if (static_cast<long long>(szsrc) * 4 > static_cast<long long>(szdest) * 3) {
    return 0;
  }

  char *cur_dest = dest;
  const unsigned char *cur_src = src;
  int src_left = szsrc;

  // Three bytes of data encode to four characters of output, so whole
  // three-byte groups are pumped through here.  The size check above
  // guarantees dest has room for every complete group.  Bytes are combined
  // individually so that nothing beyond the input is ever read.
  while (src_left >= 3) {
    const unsigned int in = (static_cast<unsigned int>(cur_src[0]) << 16) |
                            (static_cast<unsigned int>(cur_src[1]) << 8) |
                            static_cast<unsigned int>(cur_src[2]);

    cur_dest[0] = base64[in >> 18];
    cur_dest[1] = base64[(in >> 12) & 0x3F];
    cur_dest[2] = base64[(in >> 6) & 0x3F];
    cur_dest[3] = base64[in & 0x3F];

    cur_dest += 4;
    cur_src += 3;
    src_left -= 3;
  }

  // Space still available in dest for the tail (0, 1 or 2 input bytes).
  int dest_left = szdest - static_cast<int>(cur_dest - dest);

  if (src_left == 1) {
    // One byte left: this encodes to two characters, and (optionally)
    // two pad characters to round out the four-character group.
    if ((dest_left -= 2) < 0) return 0;
    const unsigned int in = cur_src[0];
    cur_dest[0] = base64[in >> 2];
    cur_dest[1] = base64[(in & 0x3) << 4];
    cur_dest += 2;
    if (do_padding) {
      if ((dest_left -= 2) < 0) return 0;
      cur_dest[0] = kPad64;
      cur_dest[1] = kPad64;
      cur_dest += 2;
    }
  } else if (src_left == 2) {
    // Two bytes left: this encodes to three characters, and (optionally)
    // one pad character to round out the four-character group.
    if ((dest_left -= 3) < 0) return 0;
    const unsigned int in = (static_cast<unsigned int>(cur_src[0]) << 8) |
                            static_cast<unsigned int>(cur_src[1]);
    cur_dest[0] = base64[in >> 10];
    cur_dest[1] = base64[(in >> 4) & 0x3F];
    cur_dest[2] = base64[(in & 0xF) << 2];
    cur_dest += 3;
    if (do_padding) {
      if ((dest_left -= 1) < 0) return 0;
      cur_dest[0] = kPad64;
      cur_dest += 1;
    }
  }
  // src_left == 0: nothing left; nothing more to do.

  return static_cast<int>(cur_dest - dest);
}
2123 :
// Standard base64 alphabet, indexed by 6-bit value.
static const char kBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "+/";
2126 :
// Web-safe base64 alphabet, indexed by 6-bit value; '-' and '_' take the
// places of '+' and '/'.
static const char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-_";
2129 :
2130 0 : int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2131 0 : return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2132 : }
2133 0 : int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2134 : int szdest, bool do_padding) {
2135 : return Base64EscapeInternal(src, szsrc, dest, szdest,
2136 0 : kWebSafeBase64Chars, do_padding);
2137 : }
2138 :
2139 0 : void Base64EscapeInternal(const unsigned char* src, int szsrc,
2140 : string* dest, bool do_padding,
2141 : const char* base64_chars) {
2142 : const int calc_escaped_size =
2143 0 : CalculateBase64EscapedLen(szsrc, do_padding);
2144 0 : dest->resize(calc_escaped_size);
2145 : const int escaped_len = Base64EscapeInternal(src, szsrc,
2146 : string_as_array(dest),
2147 : dest->size(),
2148 : base64_chars,
2149 0 : do_padding);
2150 : GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2151 0 : dest->erase(escaped_len);
2152 0 : }
2153 :
2154 0 : void Base64Escape(const unsigned char *src, int szsrc,
2155 : string* dest, bool do_padding) {
2156 0 : Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2157 0 : }
2158 :
2159 0 : void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2160 : string *dest, bool do_padding) {
2161 0 : Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2162 0 : }
2163 :
2164 0 : void Base64Escape(StringPiece src, string* dest) {
2165 0 : Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2166 0 : src.size(), dest, true);
2167 0 : }
2168 :
2169 0 : void WebSafeBase64Escape(StringPiece src, string* dest) {
2170 0 : WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2171 0 : src.size(), dest, false);
2172 0 : }
2173 :
2174 0 : void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2175 0 : WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2176 0 : src.size(), dest, true);
2177 0 : }
2178 :
// Writes the UTF-8 encoding of code_point to output and returns the number of
// bytes written (1 to 4).  No terminating NUL is written.  Implemented with
// plain shifts and masks so that no external dependencies are needed.
int EncodeAsUTF8Char(uint32 code_point, char* output) {
  // The encoded bytes are first packed into one integer, with the first
  // output byte in the most significant used position.
  uint32 packed = 0;
  int num_bytes = 0;
  if (code_point > 0xffff) {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    num_bytes = 4;
    packed = 0xf0808080 |
             ((code_point & 0x1c0000) << 6) |
             ((code_point & 0x03f000) << 4) |
             ((code_point & 0x000fc0) << 2) |
             (code_point & 0x003f);
  } else if (code_point > 0x07ff) {
    num_bytes = 3;
    packed = 0x00e08080 |
             ((code_point & 0xf000) << 4) |
             ((code_point & 0x0fc0) << 2) |
             (code_point & 0x003f);
  } else if (code_point > 0x7f) {
    num_bytes = 2;
    packed = 0x0000c080 |
             ((code_point & 0x07c0) << 2) |
             (code_point & 0x003f);
  } else {
    num_bytes = 1;
    packed = code_point;
  }

  // Emit the low num_bytes bytes of packed, most significant first.
  unsigned char* const out = reinterpret_cast<unsigned char*>(output);
  for (int i = 0; i < num_bytes; ++i) {
    const int shift = 8 * (num_bytes - 1 - i);
    out[i] = static_cast<unsigned char>((packed >> shift) & 0xff);
  }
  return num_bytes;
}
2212 :
// Length in bytes of a UTF-8 character, indexed by the value of its first
// byte.  Laid out sixteen entries per row.
static const unsigned char kUTF8LenTbl[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00 - 0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10 - 0x1F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20 - 0x2F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30 - 0x3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40 - 0x4F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50 - 0x5F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60 - 0x6F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70 - 0x7F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80 - 0x8F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x90 - 0x9F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0 - 0xAF
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0 - 0xBF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0 - 0xCF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0 - 0xDF
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0 - 0xEF
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4   // 0xF0 - 0xFF
};

// Returns the byte length of the UTF-8 character that starts at src, looked
// up from its first byte, or 0 when len is 0.
int UTF8FirstLetterNumBytes(const char* src, int len) {
  if (len != 0) {
    const unsigned char lead_byte = static_cast<unsigned char>(*src);
    return kUTF8LenTbl[lead_byte];
  }
  return 0;
}
2233 :
2234 : } // namespace protobuf
2235 : } // namespace google
|