LCOV - code coverage report
Current view: top level - third_party/protobuf/src/google/protobuf/stubs - structurally_valid.cc (source / functions) Hit Total Coverage
Test: tmp.zDYK9MVh93 Lines: 31 110 28.2 %
Date: 2015-10-10 Functions: 4 6 66.7 %

          Line data    Source code
       1             : // Copyright 2005-2008 Google Inc. All Rights Reserved.
       2             : // Author: jrm@google.com (Jim Meehan)
       3             : 
       4             : #include <google/protobuf/stubs/common.h>
       5             : 
       6             : #include <google/protobuf/stubs/stringpiece.h>
       7             : 
       8             : namespace google {
       9             : namespace protobuf {
      10             : namespace internal {
      11             : 
      12             : // These four-byte entries compactly encode how many bytes 0..255 to delete
      13             : // in making a string replacement, how many bytes to add 0..255, and the offset
      14             : // 0..64k-1 of the replacement string in remap_string.
      15             : struct RemapEntry {
      16             :   uint8 delete_bytes;
      17             :   uint8 add_bytes;
      18             :   uint16 bytes_offset;
      19             : };
      20             : 
      21             : // Exit type codes for state tables. All but the first get stuffed into
      22             : // signed one-byte entries. The first is only generated by executable code.
      23             : // To distinguish from next-state entries, these must be contiguous and
      24             : // all <= kExitNone
      25             : typedef enum {
      26             :   kExitDstSpaceFull = 239,
      27             :   kExitIllegalStructure,  // 240
      28             :   kExitOK,                // 241
      29             :   kExitReject,            // ...
      30             :   kExitReplace1,
      31             :   kExitReplace2,
      32             :   kExitReplace3,
      33             :   kExitReplace21,
      34             :   kExitReplace31,
      35             :   kExitReplace32,
      36             :   kExitReplaceOffset1,
      37             :   kExitReplaceOffset2,
      38             :   kExitReplace1S0,
      39             :   kExitSpecial,
      40             :   kExitDoAgain,
      41             :   kExitRejectAlt,
      42             :   kExitNone               // 255
      43             : } ExitReason;
      44             : 
      45             : 
      46             : // This struct represents one entire state table. The three initialized byte
      47             : // areas are state_table, remap_base, and remap_string. state0 and state0_size
      48             : // give the byte offset and length within state_table of the initial state --
      49             : // table lookups are expected to start and end in this state, but for
      50             : // truncated UTF-8 strings, may end in a different state. These allow a quick
      51             : // test for that condition. entry_shift is 8 for tables subscripted by a full
      52             : // byte value and 6 for space-optimized tables subscripted by only six
      53             : // significant bits in UTF-8 continuation bytes.
      54             : typedef struct {
      55             :   const uint32 state0;
      56             :   const uint32 state0_size;
      57             :   const uint32 total_size;
      58             :   const int max_expand;
      59             :   const int entry_shift;
      60             :   const int bytes_per_entry;
      61             :   const uint32 losub;
      62             :   const uint32 hiadd;
      63             :   const uint8* state_table;
      64             :   const RemapEntry* remap_base;
      65             :   const uint8* remap_string;
      66             :   const uint8* fast_state;
      67             : } UTF8StateMachineObj;
      68             : 
      69             : typedef UTF8StateMachineObj UTF8ScanObj;
      70             : 
      71             : #define X__ (kExitIllegalStructure)
      72             : #define RJ_ (kExitReject)
      73             : #define S1_ (kExitReplace1)
      74             : #define S2_ (kExitReplace2)
      75             : #define S3_ (kExitReplace3)
      76             : #define S21 (kExitReplace21)
      77             : #define S31 (kExitReplace31)
      78             : #define S32 (kExitReplace32)
      79             : #define T1_ (kExitReplaceOffset1)
      80             : #define T2_ (kExitReplaceOffset2)
      81             : #define S11 (kExitReplace1S0)
      82             : #define SP_ (kExitSpecial)
      83             : #define D__ (kExitDoAgain)
      84             : #define RJA (kExitRejectAlt)
      85             : 
      86             : //  Entire table has 9 state blocks of 256 entries each
      87             : static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;     // state[0]
      88             : static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // =[1]
      89             : static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
      90             : static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
      91             : static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
      92             : static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
      93             : static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
      94             : static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
      95             : 
      96             : static const uint8 utf8acceptnonsurrogates[] = {
      97             : // state[0] 0x000000 Byte 1
      98             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
      99             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     100             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     101             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     102             : 
     103             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     104             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     105             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     106             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     107             : 
     108             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     109             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     110             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     111             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     112             : 
     113             : X__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     114             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     115             :   2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
     116             :   4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     117             : 
     118             : // state[1] 0x000080 Byte 2 of 2
     119             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     120             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     121             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     122             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     123             : 
     124             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     125             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     126             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     127             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     128             : 
     129             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     130             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     131             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     132             :   0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
     133             : 
     134             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     135             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     136             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     137             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     138             : 
     139             : // state[2] 0x000000 Byte 2 of 3
     140             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     141             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     142             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     143             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     144             : 
     145             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     146             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     147             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     148             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     149             : 
     150             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     151             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     152             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     153             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     154             : 
     155             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     156             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     157             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     158             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     159             : 
     160             : // state[3] 0x001000 Byte 2 of 3
     161             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     162             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     163             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     164             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     165             : 
     166             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     167             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     168             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     169             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     170             : 
     171             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     172             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     173             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     174             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     175             : 
     176             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     177             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     178             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     179             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     180             : 
     181             : // state[4] 0x000000 Byte 2 of 4
     182             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     183             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     184             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     185             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     186             : 
     187             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     188             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     189             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     190             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     191             : 
     192             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     193             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     194             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     195             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     196             : 
     197             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     198             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     199             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     200             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     201             : 
     202             : // state[5] 0x040000 Byte 2 of 4
     203             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     204             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     205             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     206             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     207             : 
     208             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     209             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     210             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     211             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     212             : 
     213             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     214             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     215             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     216             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     217             : 
     218             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     219             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     220             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     221             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     222             : 
     223             : // state[6] 0x100000 Byte 2 of 4
     224             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     225             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     226             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     227             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     228             : 
     229             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     230             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     231             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     232             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     233             : 
     234             :   3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
     235             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     236             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     237             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     238             : 
     239             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     240             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     241             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     242             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     243             : 
     244             : // state[7] 0x00d000 Byte 2 of 3
     245             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     246             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     247             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     248             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     249             : 
     250             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     251             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     252             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     253             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     254             : 
     255             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     256             :   1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
     257             :   8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
     258             :   8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
     259             : 
     260             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     261             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     262             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     263             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     264             : 
     265             : // state[8] 0x00d800 Byte 3 of 3
     266             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     267             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     268             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     269             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     270             : 
     271             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     272             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     273             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     274             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     275             : 
     276             : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
     277             : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
     278             : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
     279             : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
     280             : 
     281             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     282             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     283             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     284             : X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
     285             : };
     286             : 
     287             : // Remap base[0] = (del, add, string_offset)
     288             : static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
     289             : {0, 0, 0} };
     290             : 
     291             : // Remap string[0]
     292             : static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
     293             : 0 };
     294             : 
     295             : static const unsigned char utf8acceptnonsurrogates_fast[256] = {
     296             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     297             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     298             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     299             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     300             : 
     301             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     302             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     303             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     304             : 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     305             : 
     306             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     307             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     308             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     309             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     310             : 
     311             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     312             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     313             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     314             : 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     315             : };
     316             : 
     317             : static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
     318             :   utf8acceptnonsurrogates_STATE0,
     319             :   utf8acceptnonsurrogates_STATE0_SIZE,
     320             :   utf8acceptnonsurrogates_TOTAL_SIZE,
     321             :   utf8acceptnonsurrogates_MAX_EXPAND_X4,
     322             :   utf8acceptnonsurrogates_SHIFT,
     323             :   utf8acceptnonsurrogates_BYTES,
     324             :   utf8acceptnonsurrogates_LOSUB,
     325             :   utf8acceptnonsurrogates_HIADD,
     326             :   utf8acceptnonsurrogates,
     327             :   utf8acceptnonsurrogates_remap_base,
     328             :   utf8acceptnonsurrogates_remap_string,
     329             :   utf8acceptnonsurrogates_fast
     330             : };
     331             : 
     332             : 
     333             : #undef X__
     334             : #undef RJ_
     335             : #undef S1_
     336             : #undef S2_
     337             : #undef S3_
     338             : #undef S21
     339             : #undef S31
     340             : #undef S32
     341             : #undef T1_
     342             : #undef T2_
     343             : #undef S11
     344             : #undef SP_
     345             : #undef D__
     346             : #undef RJA
     347             : 
     348             : // Return true if current Tbl pointer is within state0 range
     349             : // Note that unsigned compare checks both ends of range simultaneously
     350             : static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
     351           0 :   const uint8* Tbl0 = &st->state_table[st->state0];
     352           0 :   return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
     353             : }
     354             : 
     355             : // Scan a UTF-8 string based on state table.
     356             : // Always scan complete UTF-8 characters
     357             : // Set number of bytes scanned. Return reason for exiting
     358      479138 : int UTF8GenericScan(const UTF8ScanObj* st,
     359             :                     const char * str,
     360             :                     int str_length,
     361             :                     int* bytes_consumed) {
     362      479138 :   *bytes_consumed = 0;
     363      479138 :   if (str_length == 0) return kExitOK;
     364             : 
     365           0 :   int eshift = st->entry_shift;
     366           0 :   const uint8* isrc = reinterpret_cast<const uint8*>(str);
     367           0 :   const uint8* src = isrc;
     368           0 :   const uint8* srclimit = isrc + str_length;
     369           0 :   const uint8* srclimit8 = srclimit - 7;
     370           0 :   const uint8* Tbl_0 = &st->state_table[st->state0];
     371             : 
     372             :  DoAgain:
     373             :   // Do state-table scan
     374           0 :   int e = 0;
     375             :   uint8 c;
     376           0 :   const uint8* Tbl2 = &st->fast_state[0];
     377           0 :   const uint32 losub = st->losub;
     378           0 :   const uint32 hiadd = st->hiadd;
     379             :   // Check initial few bytes one at a time until 8-byte aligned
     380             :   //----------------------------
     381           0 :   while ((((uintptr_t)src & 0x07) != 0) &&
     382           0 :          (src < srclimit) &&
     383           0 :          Tbl2[src[0]] == 0) {
     384           0 :     src++;
     385             :   }
     386           0 :   if (((uintptr_t)src & 0x07) == 0) {
     387             :     // Do fast for groups of 8 identity bytes.
     388             :     // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
     389             :     // including slowing slightly on cr/lf/ht
     390             :     //----------------------------
     391           0 :     while (src < srclimit8) {
     392           0 :       uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
     393           0 :       uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
     394           0 :       src += 8;
     395             :       // This is a fast range check for all bytes in [losub..0x80-hiadd)
     396           0 :       uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
     397           0 :                     (s4567 - losub) | (s4567 + hiadd);
     398           0 :       if ((temp & 0x80808080) != 0) {
     399             :         // We typically end up here on cr/lf/ht; src was incremented
     400           0 :         int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
     401           0 :                     (Tbl2[src[-6]] | Tbl2[src[-5]]);
     402           0 :         if (e0123 != 0) {
     403             :           src -= 8;
     404             :           break;
     405             :         }    // Exit on Non-interchange
     406           0 :         e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
     407           0 :                 (Tbl2[src[-2]] | Tbl2[src[-1]]);
     408           0 :         if (e0123 != 0) {
     409           0 :           src -= 4;
     410           0 :           break;
     411             :         }    // Exit on Non-interchange
     412             :         // Else OK, go around again
     413             :       }
     414             :     }
     415             :   }
     416             :   //----------------------------
     417             : 
     418             :   // Byte-at-a-time scan
     419             :   //----------------------------
     420           0 :   const uint8* Tbl = Tbl_0;
     421           0 :   while (src < srclimit) {
     422           0 :     c = *src;
     423           0 :     e = Tbl[c];
     424           0 :     src++;
     425           0 :     if (e >= kExitIllegalStructure) {break;}
     426           0 :     Tbl = &Tbl_0[e << eshift];
     427             :   }
     428             :   //----------------------------
     429             : 
     430             : 
     431             :   // Exit possibilities:
     432             :   //  Some exit code, !state0, back up over last char
     433             :   //  Some exit code, state0, back up one byte exactly
     434             :   //  source consumed, !state0, back up over partial char
     435             :   //  source consumed, state0, exit OK
     436             :   // For illegal byte in state0, avoid backing up over PREVIOUS char
     437             :   // For truncated last char, back up to beginning of it
     438             : 
     439           0 :   if (e >= kExitIllegalStructure) {
     440             :     // Back up over exactly one byte of rejected/illegal UTF-8 character
     441           0 :     src--;
     442             :     // Back up more if needed
     443           0 :     if (!InStateZero(st, Tbl)) {
     444           0 :       do {
     445           0 :         src--;
     446           0 :       } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
     447             :     }
     448           0 :   } else if (!InStateZero(st, Tbl)) {
     449             :     // Back up over truncated UTF-8 character
     450             :     e = kExitIllegalStructure;
     451           0 :     do {
     452           0 :       src--;
     453           0 :     } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
     454             :   } else {
     455             :     // Normal termination, source fully consumed
     456             :     e = kExitOK;
     457             :   }
     458             : 
     459           0 :   if (e == kExitDoAgain) {
     460             :     // Loop back up to the fast scan
     461             :     goto DoAgain;
     462             :   }
     463             : 
     464           0 :   *bytes_consumed = src - isrc;
     465           0 :   return e;
     466             : }
     467             : 
     468      479236 : int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
     469             :                     const char * str,
     470             :                     int str_length,
     471             :                     int* bytes_consumed) {
     472      479236 :   *bytes_consumed = 0;
     473      479236 :   if (str_length == 0) return kExitOK;
     474             : 
     475      479168 :   const uint8* isrc =  reinterpret_cast<const uint8*>(str);
     476      479168 :   const uint8* src = isrc;
     477      479168 :   const uint8* srclimit = isrc + str_length;
     478      479168 :   const uint8* srclimit8 = srclimit - 7;
     479             :   int n;
     480             :   int rest_consumed;
     481             :   int exit_reason;
     482      479141 :   do {
     483             :     // Check initial few bytes one at a time until 8-byte aligned
     484      479165 :     while ((((uintptr_t)src & 0x07) != 0) &&
     485           0 :            (src < srclimit) && (src[0] < 0x80)) {
     486           0 :       src++;
     487             :     }
     488      479165 :     if (((uintptr_t)src & 0x07) == 0) {
     489    31453774 :       while ((src < srclimit8) &&
     490    15487319 :              (((reinterpret_cast<const uint32*>(src)[0] |
     491    15487319 :                 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
     492    15487296 :         src += 8;
     493             :       }
     494             :     }
     495     2956197 :     while ((src < srclimit) && (src[0] < 0x80)) {
     496     2477032 :       src++;
     497             :     }
     498             :     // Run state table on the rest
     499      479165 :     n = src - isrc;
     500      479165 :     exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
     501      479141 :     src += rest_consumed;
     502             :   } while ( exit_reason == kExitDoAgain );
     503             : 
     504      479144 :   *bytes_consumed = src - isrc;
     505      479144 :   return exit_reason;
     506             : }
     507             : 
     508             : // Hack:  On some compilers the static tables are initialized at startup.
     509             : //   We can't use them until they are initialized.  However, some Protocol
     510             : //   Buffer parsing happens at static init time and may try to validate
     511             : //   UTF-8 strings.  Since UTF-8 validation is only used for debugging
     512             : //   anyway, we simply always return success if initialization hasn't
     513             : //   occurred yet.
     514             : namespace {
     515             : 
     516             : bool module_initialized_ = false;
     517             : 
     518             : struct InitDetector {
     519             :   InitDetector() {
     520          46 :     module_initialized_ = true;
     521             :   }
     522             : };
     523             : InitDetector init_detector;
     524             : 
     525             : }  // namespace
     526             : 
     527      479256 : bool IsStructurallyValidUTF8(const char* buf, int len) {
     528      479256 :   if (!module_initialized_) return true;
     529             :   
     530      479259 :   int bytes_consumed = 0;
     531             :   UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
     532      479259 :                            buf, len, &bytes_consumed);
     533      479141 :   return (bytes_consumed == len);
     534             : }
     535             : 
     536           0 : int UTF8SpnStructurallyValid(const StringPiece& str) {
     537           0 :   if (!module_initialized_) return str.size();
     538             : 
     539           0 :   int bytes_consumed = 0;
     540             :   UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
     541           0 :                            str.data(), str.size(), &bytes_consumed);
     542           0 :   return bytes_consumed;
     543             : }
     544             : 
     545             : // Coerce UTF-8 byte string in src_str to be
     546             : // a structurally-valid equal-length string by selectively
     547             : // overwriting illegal bytes with replace_char (typically blank).
     548             : // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
     549             : // src_str is read-only. If any overwriting is needed, a modified byte string
     550             : // is created in idst, with the same length as src_str.
     551             : //
     552             : // Returns pointer to output buffer: src_str.data() if no changes were made,
     553             : //  or idst if some bytes were changed.
     554             : //
     555             : // Fast case: all is structurally valid and no byte copying is done.
     556             : //
     557           0 : char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
     558             :                                     char* idst,
     559             :                                     const char replace_char) {
     560           0 :   const char* isrc = src_str.data();
     561           0 :   const int len = src_str.length();
     562           0 :   int n = UTF8SpnStructurallyValid(src_str);
     563           0 :   if (n == len) {               // Normal case -- all is cool, return
     564             :     return const_cast<char*>(isrc);
     565             :   } else {                      // Unusual case -- copy w/o bad bytes
     566           0 :     const char* src = isrc;
     567           0 :     const char* srclimit = isrc + len;
     568           0 :     char* dst = idst;
     569           0 :     memmove(dst, src, n);       // Copy initial good chunk
     570           0 :     src += n;
     571           0 :     dst += n;
     572           0 :     while (src < srclimit) {    // src points to bogus byte or is off the end
     573           0 :       dst[0] = replace_char;                    // replace one bad byte
     574           0 :       src++;
     575           0 :       dst++;
     576           0 :       StringPiece str2(src, srclimit - src);
     577           0 :       n = UTF8SpnStructurallyValid(str2);       // scan the remainder
     578           0 :       memmove(dst, src, n);                     // copy next good chunk
     579           0 :       src += n;
     580           0 :       dst += n;
     581             :     }
     582             :   }
     583             :   return idst;
     584             : }
     585             : 
     586             : }  // namespace internal
     587             : }  // namespace protobuf
     588          92 : }  // namespace google

Generated by: LCOV version 1.10