Ruby 1.9.3p392 (2013-02-22 revision 39386)
utf_16be.c
Go to the documentation of this file.
1 /**********************************************************************
2  utf_16be.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regenc.h"
31 
/*
 * Surrogate tests.  Each macro inspects a single byte value; in this file
 * it is always the first (most significant) byte of a 16-bit unit.
 *
 *   FIRST  : byte in 0xd8..0xdb
 *   SECOND : byte in 0xdc..0xdf
 *   any    : byte in 0xd8..0xdf
 */
#define UTF16_IS_SURROGATE_FIRST(c)   (0xd8 == ((c) & 0xfc))
#define UTF16_IS_SURROGATE_SECOND(c)  (0xdc == ((c) & 0xfc))
#define UTF16_IS_SURROGATE(c)         (0xd8 == ((c) & 0xf8))
35 
/*
 * Character length in bytes, indexed by the first byte of a character.
 * Every entry is 2 except 0xd8..0xdb, which are 4.
 */
static const int EncLen_UTF16[] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2,   4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2
};
54 
55 static int
56 utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED,
57  OnigEncoding enc ARG_UNUSED)
58 {
59  int byte = p[0];
60  if (!UTF16_IS_SURROGATE(byte)) {
61  if (2 <= e-p)
63  else
65  }
66  if (UTF16_IS_SURROGATE_FIRST(byte)) {
67  switch (e-p) {
68  case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3);
69  case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2);
70  case 3:
71  if (UTF16_IS_SURROGATE_SECOND(p[2]))
73  break;
74  default:
75  if (UTF16_IS_SURROGATE_SECOND(p[2]))
77  break;
78  }
79  }
81 }
82 
83 static int
84 utf16be_is_mbc_newline(const UChar* p, const UChar* end,
85  OnigEncoding enc)
86 {
87  if (p + 1 < end) {
88  if (*(p+1) == 0x0a && *p == 0x00)
89  return 1;
90 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
91  if ((
92 #ifndef USE_CRNL_AS_LINE_TERMINATOR
93  *(p+1) == 0x0d ||
94 #endif
95  *(p+1) == 0x85) && *p == 0x00)
96  return 1;
97  if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
98  return 1;
99 #endif
100  }
101  return 0;
102 }
103 
104 static OnigCodePoint
105 utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
106  OnigEncoding enc)
107 {
108  OnigCodePoint code;
109 
110  if (UTF16_IS_SURROGATE_FIRST(*p)) {
111  code = ((((p[0] << 8) + p[1]) & 0x03ff) << 10)
112  + (((p[2] << 8) + p[3]) & 0x03ff) + 0x10000;
113  }
114  else {
115  code = p[0] * 256 + p[1];
116  }
117  return code;
118 }
119 
120 static int
122  OnigEncoding enc)
123 {
124  return (code > 0xffff ? 4 : 2);
125 }
126 
127 static int
129  OnigEncoding enc)
130 {
131  UChar* p = buf;
132 
133  if (code > 0xffff) {
134  unsigned int high = (code >> 10) + 0xD7C0;
135  unsigned int low = (code & 0x3FF) + 0xDC00;
136  *p++ = (high >> 8) & 0xFF;
137  *p++ = high & 0xFF;
138  *p++ = (low >> 8) & 0xFF;
139  *p++ = low & 0xFF;
140  return 4;
141  }
142  else {
143  *p++ = (UChar )((code & 0xff00) >> 8);
144  *p++ = (UChar )(code & 0xff);
145  return 2;
146  }
147 }
148 
149 static int
151  const UChar** pp, const UChar* end, UChar* fold,
152  OnigEncoding enc)
153 {
154  const UChar* p = *pp;
155 
156  if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) {
157  p++;
158 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
159  if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
160  if (*p == 0x49) {
161  *fold++ = 0x01;
162  *fold = 0x31;
163  (*pp) += 2;
164  return 2;
165  }
166  }
167 #endif
168 
169  *fold++ = 0;
171  *pp += 2;
172  return 2;
173  }
174  else
175  return onigenc_unicode_mbc_case_fold(enc, flag,
176  pp, end, fold);
177 }
178 
#if 0
/*
 * Disabled code, never compiled.
 *
 * Reports whether the character at *pp has a case counterpart, looking
 * only at characters whose high byte is 0.  *pp is advanced by the length
 * found in EncLen_UTF16 for the first byte.
 *
 *   flag - case-fold options; INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR makes
 *          0xdf count as ambiguous.
 *   pp   - in/out cursor.
 *   end  - not referenced.
 *
 * Returns TRUE or FALSE.
 */
static int
utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp) += EncLen_UTF16[*p];

  if (*p == 0) {
    int c, v;

    p++;
    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                       (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

    /* Test the LOWER bit with '&'.  The previous '|' made this condition
       always true, so the final return below could never be reached. */
    if ((v & BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      if (c >= 0xaa && c <= 0xba)
        return FALSE;
      else
        return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
212 
213 static UChar*
214 utf16be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
215  OnigEncoding enc ARG_UNUSED)
216 {
217  if (s <= start) return (UChar* )s;
218 
219  if ((s - start) % 2 == 1) {
220  s--;
221  }
222 
223  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
224  s -= 2;
225 
226  return (UChar* )s;
227 }
228 
229 static int
231  const OnigUChar* p, const OnigUChar* end,
232  OnigCaseFoldCodeItem items[],
233  OnigEncoding enc)
234 {
236  flag, p, end, items);
237 }
238 
239 OnigEncodingDefine(utf_16be, UTF_16BE) = {
241  "UTF-16BE", /* name */
242  4, /* max byte length */
243  2, /* min byte length */
256 };
257 ENC_ALIAS("UCS-2BE", "UTF-16BE")
258