ucommon
|
00001 // Copyright (C) 2009-2010 David Sugar, Tycho Softworks. 00002 // 00003 // This file is part of GNU uCommon C++. 00004 // 00005 // GNU uCommon C++ is free software: you can redistribute it and/or modify 00006 // it under the terms of the GNU Lesser General Public License as published 00007 // by the Free Software Foundation, either version 3 of the License, or 00008 // (at your option) any later version. 00009 // 00010 // GNU uCommon C++ is distributed in the hope that it will be useful, 00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 // GNU Lesser General Public License for more details. 00014 // 00015 // You should have received a copy of the GNU Lesser General Public License 00016 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>. 00017 00032 #ifndef _UCOMMON_UNICODE_H_ 00033 #define _UCOMMON_UNICODE_H_ 00034 00035 #ifndef _UCOMMON_STRING_H_ 00036 #include <ucommon/string.h> 00037 #endif 00038 00039 NAMESPACE_UCOMMON 00040 00045 typedef int32_t ucs4_t; 00046 00050 typedef int16_t ucs2_t; 00051 00055 typedef void *unicode_t; 00056 00062 class __EXPORT utf8 00063 { 00064 public: 00068 static const unsigned ucsize; 00069 00073 static const char *nil; 00074 00080 static unsigned size(const char *codepoint); 00081 00087 static size_t count(const char *string); 00088 00095 static char *offset(char *string, ssize_t position); 00096 00102 static ucs4_t codepoint(const char *encoded); 00103 00109 static size_t chars(const unicode_t string); 00110 00116 static size_t chars(ucs4_t character); 00117 00124 static size_t unpack(const unicode_t string, CharacterProtocol& buffer); 00125 00133 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size); 00134 00138 static ucs4_t *udup(const char *string); 00139 00143 static ucs2_t *wdup(const char *string); 00144 00152 static const char *find(const char *string, ucs4_t character, size_t start = 0); 00153 00161 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l); 00162 00169 static unsigned ccount(const char *string, ucs4_t character); 00170 00176 static ucs4_t get(CharacterProtocol& buffer); 00177 00184 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer); 00185 }; 00186 00193 class __EXPORT UString : public String, public utf8 00194 { 00195 protected: 00199 UString(); 00200 00205 UString(strsize_t size); 00206 00211 UString(const unicode_t text); 00212 00219 UString(const char *text, strsize_t size); 00220 00227 UString(const unicode_t *text, const unicode_t *end); 00228 00234 UString(const UString& existing); 00235 00240 virtual ~UString(); 00241 00248 UString get(strsize_t codepoint, strsize_t size = 0) const; 00249 00256 size_t get(unicode_t unicode, size_t size) const; 00257 00262 void set(const unicode_t unicode); 00263 00268 void add(const unicode_t unicode); 00269 00275 ucs4_t at(int position) const; 00276 00283 inline size_t operator()(unicode_t unicode, size_t size) const 00284 {return get(unicode, size);}; 00285 00292 UString operator()(int codepoint, strsize_t size) const; 00293 00301 const char *operator()(int offset) const; 00302 00308 inline ucs4_t operator[](int position) const 00309 {return UString::at(position);}; 00310 00315 inline strsize_t count(void) const 00316 {return utf8::count(str->text);} 00317 00323 unsigned ccount(ucs4_t character) const; 00324 00331 const char *find(ucs4_t character, strsize_t start = 0) const; 00332 00339 const char *rfind(ucs4_t character, strsize_t end = npos) const; 00340 }; 00341 00347 class __EXPORT utf8_pointer 00348 { 00349 protected: 00350 uint8_t *text; 00351 00352 public: 00356 utf8_pointer(); 00357 00362 utf8_pointer(const char *string); 00363 00368 utf8_pointer(const utf8_pointer& copy); 00369 00374 utf8_pointer& operator ++(); 00375 00380 utf8_pointer& operator --(); 00381 00387 utf8_pointer& operator +=(long offset); 00388 00394 utf8_pointer& operator -=(long offset); 00395 00401 utf8_pointer operator+(long offset) const; 00402 00408 utf8_pointer operator-(long offset) const; 00409 00414 inline operator bool() const 00415 {return text != NULL;}; 00416 00421 inline bool operator!() const 00422 {return text == NULL;}; 00423 00429 ucs4_t operator[](long codepoint) const; 00430 00436 utf8_pointer& operator=(const char *string); 00437 00441 void inc(void); 00442 00446 void dec(void); 00447 00453 inline bool operator==(const char *string) const 00454 {return (const char *)text == string;}; 00455 00461 inline bool operator!=(const char *string) const 00462 {return (const char *)text != string;}; 00463 00468 inline ucs4_t operator*() const 00469 {return utf8::codepoint((const char *)text);}; 00470 00475 inline char *c_str(void) const 00476 {return (char *)text;}; 00477 00482 inline operator char*() const 00483 {return (char *)text;}; 00484 00489 inline size_t len(void) const 00490 {return utf8::count((const char *)text);}; 00491 }; 00492 00493 inline ucs4_t *strudup(const char *string) 00494 {return utf8::udup(string);} 00495 00496 inline ucs2_t *strwdup(const char *string) 00497 {return utf8::wdup(string);} 00498 00499 __EXPORT unicode_t unidup(const char *string); 00500 00501 template<> 00502 inline void dupfree<ucs2_t*>(ucs2_t *string) 00503 {::free(string);} 00504 00505 template<> 00506 inline void dupfree<ucs4_t*>(ucs4_t *string) 00507 {::free(string);} 00508 00509 template<> 00510 inline void dupfree<unicode_t>(unicode_t string) 00511 {::free(string);} 00512 00516 typedef UString ustring_t; 00517 00521 typedef utf8_pointer utf8_t; 00522 00523 END_NAMESPACE 00524 00525 #endif