Ruby 1.9.3p392 (2013-02-22 revision 39386)
shift_jis.c
Go to the documentation of this file.
1 /**********************************************************************
2  sjis.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in the
15  * documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regint.h"
31 
/*
 * Character byte length indexed by the value of the first byte.
 * Entries are 2 for 0x81-0x9f and 0xe0-0xfc and 1 for every other value.
 */
static const int EncLen_SJIS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
50 
/*
 * Flag table indexed by byte value: non-zero when the byte may appear as the
 * second byte of a two-byte character.  Set for 0x40-0x7e and 0x80-0xfc.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
69 
/* True when `byte` starts a two-byte character according to EncLen_SJIS. */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Non-zero when `byte` is flagged in SJIS_CAN_BE_TRAIL_TABLE. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
72 
/*
 * States of the byte-sequence validator.  Negative values are terminal:
 * ACCEPT ends a valid character, FAILURE marks an invalid sequence.
 * S0 is the state before the first byte, S1 the state after a lead byte.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
#define A ACCEPT
#define F FAILURE
/*
 * Transition table: trans[state][byte] gives the next state.
 * Row S0 accepts 0x00-0x7f and 0xa1-0xdf as complete characters, moves to S1
 * on 0x81-0x9f and 0xe0-0xfc, and fails on 0x80, 0xa0 and 0xfd-0xff.
 * Row S1 accepts 0x40-0x7e and 0x80-0xfc and fails on everything else.
 */
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  }
};
#undef A
#undef F
116 
/*
 * Classify the character that starts at p, reading no further than e, by
 * running its bytes through the `trans` state table.  The result is built
 * with the ONIGENC_CONSTRUCT_MBCLEN_* macros: CHARFOUND(1) or CHARFOUND(2)
 * for a complete character, NEEDMORE(...) when the input stops after a lead
 * byte.
 *
 * NOTE(review): the embedded listing numbers jump 123 -> 125 and 127 -> 129,
 * so the second arm of both conditional expressions is absent from this copy.
 * Presumably each is ONIGENC_CONSTRUCT_MBCLEN_INVALID(); -- confirm against
 * the upstream file before building.
 */
117 static int
118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
119 {
120  int firstbyte = *p++;
121  state_t s;
 /* Row 0 of trans is the start state; a negative result is terminal. */
122  s = trans[0][firstbyte];
123  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
 /* Lead byte seen but no input left: report how many bytes are still needed. */
125  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
126  s = trans[s][*p++];
127  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
129 }
130 
/*
 * Map a code point to its encoded byte length: 1 when code < 256 and its
 * EncLen_SJIS entry is 1, 2 when 256 <= code <= 0xffff.
 *
 * NOTE(review): listing lines 132, 138 and 144 are absent from this copy:
 * the line carrying the function name and parameter list, the statement of
 * the inner `else`, and the statement of the final `else`.  The disabled
 * is_code_ctype in this file calls code_to_mbclen(code), which is presumably
 * this function -- confirm the signature and both missing return values
 * against the upstream file.
 */
131 static int
133 {
134  if (code < 256) {
135  if (EncLen_SJIS[(int )code] == 1)
136  return 1;
137  else
139  }
140  else if (code <= 0xffff) {
141  return 2;
142  }
143  else
145 }
146 
147 static OnigCodePoint
148 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
149 {
150  int c, i, len;
151  OnigCodePoint n;
152 
153  len = enclen(enc, p, end);
154  c = *p++;
155  n = c;
156  if (len == 1) return n;
157 
158  for (i = 1; i < len; i++) {
159  if (p >= end) break;
160  c = *p++;
161  n <<= 8; n += c;
162  }
163  return n;
164 }
165 
/*
 * Store a code point into buf as one or two bytes and return the number of
 * bytes written: the high byte (bits 8-15) is emitted only when it is
 * non-zero, followed by the low byte.
 *
 * NOTE(review): listing line 167 (function name and parameter list) is
 * absent from this copy.  The body uses `code`, `buf` and `enc`, and the
 * encoding definition in this file lists code_to_mbc, which is presumably
 * this function -- confirm the signature against the upstream file.
 */
166 static int
168 {
169  UChar *p = buf;
170 
171  if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
172  *p++ = (UChar )(code & 0xff);
173 
 /* Disabled consistency check between the written length and enclen(). */
174 #if 0
175  if (enclen(enc, buf) != (p - buf))
176  return REGERR_INVALID_CODE_POINT_VALUE;
177 #endif
178  return (int)(p - buf);
179 }
180 
/*
 * Write the case-folded form of the character at *pp into `lower`, advance
 * *pp past it and return the number of bytes produced.  Characters that fail
 * ONIGENC_IS_MBC_ASCII are copied unchanged, enclen() bytes at a time.
 *
 * NOTE(review): listing lines 182 and 189 are absent from this copy: the line
 * with the function name and first parameter, and the statement in the ASCII
 * branch that stores into `lower`.  Presumably this is mbc_case_fold and the
 * missing statement writes the lower-cased ASCII byte -- confirm against the
 * upstream file.
 */
181 static int
183  const UChar** pp, const UChar* end, UChar* lower,
184  OnigEncoding enc)
185 {
186  const UChar* p = *pp;
187 
188  if (ONIGENC_IS_MBC_ASCII(p)) {
190  (*pp)++;
191  return 1;
192  }
193  else {
194  int i;
195  int len = enclen(enc, p, end);
196 
197  for (i = 0; i < len; i++) {
198  *lower++ = *p++;
199  }
200  (*pp) += len;
201  return len; /* return byte length of converted char to lower */
202  }
203 }
204 
/*
 * Disabled code (never compiled).  As written it forwards to
 * onigenc_mbn_is_mbc_ambiguous() with an `enc` argument that is not among
 * this function's parameters.
 */
#if 0
static int
is_mbc_ambiguous(OnigCaseFoldType flag,
                 const UChar** pp, const UChar* end)
{
  return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);

}
#endif
214 
/*
 * Disabled code (never compiled): an earlier two-argument form of
 * is_code_ctype.  ASCII code points use the ASCII ctype table; for other
 * code points the word/graph/print classes hold when code_to_mbclen(code)
 * is greater than 1.
 */
#if 0
static int
is_code_ctype(OnigCodePoint code, unsigned int ctype)
{
  if (code < 128)
    return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
  else {
    if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
      return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
    }
  }

  return FALSE;
}
#endif
230 
231 static UChar*
232 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
233 {
234  const UChar *p;
235  int len;
236 
237  if (s <= start) return (UChar* )s;
238  p = s;
239 
240  if (SJIS_ISMB_TRAIL(*p)) {
241  while (p > start) {
242  if (! SJIS_ISMB_FIRST(*--p)) {
243  p++;
244  break;
245  }
246  }
247  }
248  len = enclen(enc, p, end);
249  if (p + len > s) return (UChar* )p;
250  p += len;
251  return (UChar* )(p + ((s - p) & ~1));
252 }
253 
254 static int
255 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
256 {
257  const UChar c = *s;
258  return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
259 }
260 
261 
/*
 * File-scope state for the named character properties.
 *
 * NOTE(review): listing lines 263 and 266 are absent from this copy.  The
 * functions below index `PropertyList` and pass `PropertyNameTable` to
 * onig_st_lookup_strend(), so their declarations presumably sat on those
 * lines -- restore them from the upstream file.
 */
/* Set to 1 once the property list has been populated. */
262 static int PropertyInited = 0;
/* Compared against a ctype index before PropertyList is indexed. */
264 static int PropertyListNum;
265 static int PropertyListSize;
267 
268 static const OnigCodePoint CR_Hiragana[] = {
269  1,
270  0x829f, 0x82f1
271 }; /* CR_Hiragana */
272 
273 static const OnigCodePoint CR_Katakana[] = {
274  4,
275  0x00a6, 0x00af,
276  0x00b1, 0x00dd,
277  0x8340, 0x837e,
278  0x8380, 0x8396,
279 }; /* CR_Katakana */
280 
/*
 * Register the "hiragana" and "katakana" code range tables and mark the
 * property list as initialised.
 *
 * NOTE(review): listing line 282 (function name and parameter list) is
 * absent from this copy; presumably this is init_property_list(void) --
 * confirm against the upstream file.  `r` is never assigned in the visible
 * text, so PROPERTY_LIST_ADD_PROP presumably sets it and jumps to `end` on
 * failure -- verify against the macro definition.
 */
281 static int
283 {
284  int r;
285 
286  PROPERTY_LIST_ADD_PROP("hiragana", CR_Hiragana);
287  PROPERTY_LIST_ADD_PROP("katakana", CR_Katakana);
288  PropertyInited = 1;
289 
290  end:
291  return r;
292 }
293 
/*
 * Resolve a property name in [p, end) to a ctype value.  A working copy of
 * the name is built in an ALLOCA_N buffer and looked up in PropertyNameTable.
 * When the lookup returns 0 the call falls back to
 * onigenc_minimum_property_name_to_ctype().
 *
 * NOTE(review): listing lines 295, 300 and 304 are absent from this copy:
 * the function name and parameter list, a statement before the allocation,
 * and the body of the copy loop.  Presumably this is property_name_to_ctype,
 * the first missing statement is the property-list initialisation check, and
 * the loop body stores a lower-cased byte through `e` -- confirm all three
 * against the upstream file.
 */
294 static int
296 {
297  hash_data_type ctype;
298  UChar *s, *e;
299 
301 
302  s = e = ALLOCA_N(UChar, end-p+1);
303  for (; p < end; p++) {
305  }
306 
307  if (onig_st_lookup_strend(PropertyNameTable, s, e, &ctype) == 0) {
308  return onigenc_minimum_property_name_to_ctype(enc, s, e);
309  }
310 
311  return (int)ctype;
312 }
313 
/*
 * Test whether `code` belongs to character type `ctype`.
 *
 * Standard ctypes (ctype <= ONIGENC_MAX_STD_CTYPE): code points below 128
 * use the ASCII ctype table; any other code point is TRUE for the
 * word/graph/print classes and FALSE otherwise.
 * Extended ctypes: the value is rebased to an index into PropertyList and
 * the code point is tested against that range table.  An index outside
 * PropertyListNum returns ONIGERR_TYPE_BUG.
 *
 * NOTE(review): listing line 327 (first statement of the `else` branch) is
 * absent from this copy; presumably it is the property-list initialisation
 * check -- confirm against the upstream file.
 */
314 static int
315 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
316 {
317  if (ctype <= ONIGENC_MAX_STD_CTYPE) {
318  if (code < 128)
319  return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
320  else {
321  if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
322  return TRUE;
323  }
324  }
325  }
326  else {
328 
 /* Rebase the extended ctype so it indexes PropertyList from 0. */
329  ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
330  if (ctype >= (unsigned int )PropertyListNum)
331  return ONIGERR_TYPE_BUG;
332 
333  return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
334  }
335 
336  return FALSE;
337 }
338 
/*
 * Hand back the code range table for an extended ctype.
 *
 * Standard ctypes return ONIG_NO_SUPPORT_CONFIG.  For extended ctypes
 * *sb_out is set to 0x80, the ctype is rebased to a PropertyList index, and
 * *ranges receives the matching table; the return value is then 0.  An index
 * outside PropertyListNum returns ONIGERR_TYPE_BUG.
 *
 * NOTE(review): listing lines 340 and 349 are absent from this copy: the
 * function name with its leading parameters (`ctype` and `sb_out` are used
 * in the body) and a statement after the *sb_out assignment.  Presumably
 * this is get_ctype_code_range and the missing statement is the
 * property-list initialisation check -- confirm against the upstream file.
 */
339 static int
341  const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
342 {
343  if (ctype <= ONIGENC_MAX_STD_CTYPE) {
344  return ONIG_NO_SUPPORT_CONFIG;
345  }
346  else {
347  *sb_out = 0x80;
348 
350 
351  ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
352  if (ctype >= (OnigCtype )PropertyListNum)
353  return ONIGERR_TYPE_BUG;
354 
355  *ranges = PropertyList[ctype];
356  return 0;
357  }
358 }
359 
/*
 * Encoding definition for Shift_JIS: the initializer lists the handlers and
 * constants for this encoding in positional order.
 *
 * NOTE(review): listing lines 365, 367 and 369-376 are absent from this copy,
 * so most initializer entries are missing and the remaining ones no longer
 * sit in their original positions.  Restore the full list from the upstream
 * file before building.
 */
360 OnigEncodingDefine(shift_jis, Shift_JIS) = {
361  mbc_enc_len,
362  "Shift_JIS", /* name */
363  2, /* max byte length */
364  1, /* min byte length */
366  mbc_to_code,
368  code_to_mbc,
377  0
378 };
379 /*
380  * Name: Shift_JIS
381  * MIBenum: 17
382  * Link: http://www.iana.org/assignments/character-sets
383  * Link: http://ja.wikipedia.org/wiki/Shift_JIS
384  */
385 
386 /*
387  * Name: Windows-31J
388  * MIBenum: 2024
389  * Link: http://www.iana.org/assignments/character-sets
390  * Link: http://www.microsoft.com/globaldev/reference/dbcs/932.mspx
391  * Link: http://ja.wikipedia.org/wiki/Windows-31J
392  * Link: http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-932-2000.ucm
393  *
394  * Windows Standard Character Set and its mapping to Unicode by Microsoft.
395  * Since 1.9.3, SJIS is the alias of Windows-31J because its character
396  * set is usually this one even if its mapping may differ.
397  */
398 ENC_REPLICATE("Windows-31J", "Shift_JIS")
399 ENC_ALIAS("CP932", "Windows-31J")
400 ENC_ALIAS("csWindows31J", "Windows-31J") /* IANA. IE6 doesn't accept Windows-31J but does accept csWindows31J. */
401 ENC_ALIAS("SJIS", "Windows-31J")
402 
403 /*
404  * Name: PCK
405  * Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/x-2chn0/index.html
406  * Link: http://download.oracle.com/docs/cd/E19253-01/819-0606/appb-pckwarn-1/index.html
407  *
408  * Solaris's SJIS variant. Its set is Windows Standard Character Set; it
409  * consists of JIS X 0201 Latin (US-ASCII), JIS X 0201 Katakana, JIS X 0208, NEC
410  * special characters, NEC-selected IBM extended characters, and IBM extended
411  * characters. Solaris's iconv seems to use SJIS-open.
412  */
413 ENC_ALIAS("PCK", "Windows-31J")
414 
415 /*
416  * Name: MacJapanese
417  * Link: http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT
418  * Link: http://ja.wikipedia.org/wiki/MacJapanese
419  */
420 ENC_REPLICATE("MacJapanese", "Shift_JIS")
421 ENC_ALIAS("MacJapan", "MacJapanese")
422