FIFE
2008.0
|
// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/


#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include <iterator>

namespace utf8
{
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // The two offsets below are not referenced anywhere in this header;
    // presumably they serve surrogate-pair arithmetic in code that includes it.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. Going through this
    // helper keeps negative (signed char) inputs from sign-extending.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc has the bit pattern 10xxxxxx, i.e. is a UTF-8 trail octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies anywhere in the surrogate range 0xd800 - 0xdfff.
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is accepted as a code point by this library: not above
    // CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the number of octets (1 - 4) announced by the lead octet that
    // lead_it points at, or 0 when that octet cannot start a sequence.
    // lead_it must be dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Validates and decodes the UTF-8 sequence that starts at it.
    //
    //   it          in/out: on OK it is moved just past the sequence; on any
    //               error it is left on the lead octet of the rejected sequence.
    //               Must be at least a bidirectional iterator, because failed
    //               sequences are rolled back.
    //   end         one past the last readable octet.
    //   code_point  optional (may be null); receives the decoded value, and is
    //               written only when the function returns OK.
    //
    // Returns OK, or:
    //   NOT_ENOUGH_ROOM      [it, end) is empty or shorter than the lead octet announces
    //   INVALID_LEAD         the first octet cannot start a sequence
    //   INCOMPLETE_SEQUENCE  an expected trail octet is not of the form 10xxxxxx
    //   INVALID_CODE_POINT   the decoded value fails is_code_point_valid
    //   OVERLONG_SEQUENCE    the value was encoded with more octets than necessary
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;

        // Nothing may be read from an empty range, so check before dereferencing.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Check the lead octet
        const uint8_t lead = mask8(*it);
        const octet_difference_type length = sequence_length(it);

        if (length == 0)
            return INVALID_LEAD;

        // "Shortcut" for ASCII characters
        if (length == 1) {
            if (code_point)
                *code_point = lead;
            ++it;
            return OK;
        }

        // Do we have enough memory?
        if (std::distance(it, end) < length)
            return NOT_ENOUGH_ROOM;

        // Payload bits carried by the lead octet: 5, 4 or 3 of them for
        // 2-, 3- and 4-octet sequences respectively.
        uint32_t cp = static_cast<uint32_t>(lead) & (0xffu >> (length + 1));

        // Check trail octets and calculate the code point, six bits at a time.
        for (octet_difference_type consumed = 1; consumed < length; ++consumed) {
            ++it;
            if (!is_trail(*it)) {
                // Roll back to the lead octet of the broken sequence.
                std::advance(it, -consumed);
                return INCOMPLETE_SEQUENCE;
            }
            cp = (cp << 6) | (mask8(*it) & 0x3fu);
        }
        // From here on, it rests on the last trail octet of the sequence.

        // Is the code point valid?
        if (!is_code_point_valid(cp)) {
            std::advance(it, -(length - 1));
            return INVALID_CODE_POINT;
        }

        // Fewest octets able to encode cp; a longer encoding is overlong.
        octet_difference_type shortest = 4;
        if (cp < 0x80u)
            shortest = 1;
        else if (cp < 0x800u)
            shortest = 2;
        else if (cp < 0x10000u)
            shortest = 3;

        if (length != shortest) {
            std::advance(it, -(length - 1));
            return OVERLONG_SEQUENCE;
        }

        if (code_point)
            *code_point = cp;

        ++it;
        return OK;
    }

    // Same as the three-argument overload, for callers that only need the
    // verdict and not the decoded value.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal

    // The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    // Returns an iterator to the lead octet of the first sequence in
    // [start, end) that fails validation, or end when every sequence is valid.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            internal::utf_error err_code = internal::validate_next(result, end);
            if (err_code != internal::OK)
                return result;
        }
        return result;
    }

    // True when [start, end) consists solely of valid sequences (an empty
    // range counts as valid).
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    // True when the octets at it are the byte order mark 0xef 0xbb 0xbf.
    // No end iterator is taken: up to three octets are read (comparison stops
    // at the first mismatch), so the caller must guarantee they are readable.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
            (internal::mask8(*it++)) == bom[1] &&
            (internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8

#endif // header guard