• Skip to content
  • Skip to link menu
KDE 4.3 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

kencodingprober.cpp

Go to the documentation of this file.
00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019     Boston, MA 02110-1301, USA.
00020 
00021 */
00022 
00023 #include "kencodingprober.h"
00024 
00025 #include "klocale.h"
00026 
00027 #include "probers/nsCharSetProber.h"
00028 #include "probers/nsUniversalDetector.h"
00029 #include "probers/ChineseGroupProber.h"
00030 #include "probers/JapaneseGroupProber.h"
00031 #include "probers/UnicodeGroupProber.h"
00032 #include "probers/nsSBCSGroupProber.h"
00033 #include "probers/nsMBCSGroupProber.h"
00034 
00035 #include <string.h>
00036 
00037 class KEncodingProberPrivate
00038 {
00039 public:
00040     KEncodingProberPrivate(): prober(NULL), mStart(true) {};
00041     ~KEncodingProberPrivate()
00042     {
00043         delete prober;
00044     }
00045     void setProberType(KEncodingProber::ProberType pType)
00046     {
00047         proberType = pType;
00048         /* handle multi-byte encodings carefully , because they're hard to detect,
00049         *   and have to use some Stastics methods.
00050         * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
00051         *   because encoding state machine can detect many such encodings.
00052         */ 
00053 
00054         delete prober;
00055 
00056         switch (proberType) {
00057             case KEncodingProber::None:
00058                 prober = NULL;
00059                 break;
00060             case KEncodingProber::Arabic:
00061             case KEncodingProber::Baltic:
00062             case KEncodingProber::CentralEuropean:
00063             case KEncodingProber::Cyrillic:
00064             case KEncodingProber::Greek:
00065             case KEncodingProber::Hebrew:
00066             case KEncodingProber::NorthernSaami:
00067             case KEncodingProber::Other:
00068             case KEncodingProber::SouthEasternEurope:
00069             case KEncodingProber::Thai:
00070             case KEncodingProber::Turkish:
00071             case KEncodingProber::WesternEuropean:
00072                 prober = new kencodingprober::nsSBCSGroupProber();
00073                 break;
00074             case KEncodingProber::ChineseSimplified:
00075             case KEncodingProber::ChineseTraditional:
00076                 prober = new kencodingprober::ChineseGroupProber();
00077                 break;
00078             case KEncodingProber::Japanese:
00079                 prober = new kencodingprober::JapaneseGroupProber();
00080                 break;
00081             case KEncodingProber::Korean:
00082                 prober = new kencodingprober::nsMBCSGroupProber();
00083                 break;
00084             case KEncodingProber::Unicode:
00085                 prober = new kencodingprober::UnicodeGroupProber();
00086                 break;
00087             case KEncodingProber::Universal:
00088                 prober = new kencodingprober::nsUniversalDetector();
00089                 break;
00090             default:
00091                 prober = NULL;
00092         }
00093     }
00094     void unicodeTest(const char *aBuf, int aLen)
00095     {
00096         if (mStart)
00097         {
00098             mStart = false;
00099             if (aLen > 3)
00100             switch (aBuf[0])
00101             {
00102                 case '\xEF':
00103                     if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00104                     // EF BB BF  UTF-8 encoded BOM
00105                     proberState = KEncodingProber::FoundIt;
00106                     break;
00107                 case '\xFE':
00108                     if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00109                         // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
00110                         proberState = KEncodingProber::FoundIt;
00111                     else if ('\xFF' == aBuf[1])
00112                         // FE FF  UTF-16, big endian BOM
00113                         proberState = KEncodingProber::FoundIt;
00114                         break;
00115                 case '\x00':
00116                     if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00117                         // 00 00 FE FF  UTF-32, big-endian BOM
00118                         proberState = KEncodingProber::FoundIt;
00119                     else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00120                         // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
00121                         proberState = KEncodingProber::FoundIt;
00122                         break;
00123                 case '\xFF':
00124                     if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00125                         // FF FE 00 00  UTF-32, little-endian BOM
00126                         proberState = KEncodingProber::FoundIt;
00127                     else if ('\xFE' == aBuf[1])
00128                         // FF FE  UTF-16, little endian BOM
00129                         proberState = KEncodingProber::FoundIt;
00130                         break;
00131             }  // switch
00132 
00133         }
00134     }
00135     KEncodingProber::ProberType proberType;
00136     KEncodingProber::ProberState proberState;
00137     kencodingprober::nsCharSetProber *prober;
00138     bool mStart;
00139 };
00140 
00141 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
00142 {
00143     setProberType(proberType);
00144 }
00145 
00146 KEncodingProber::~KEncodingProber()
00147 {
00148     delete d;
00149 }
00150 
00151 void KEncodingProber::reset()
00152 {
00153     d->proberState = KEncodingProber::Probing;
00154     d->mStart = true;
00155 }
00156 
00157 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
00158 {
00159     return feed(data.data(), data.size());
00160 }
00161 
00162 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
00163 {
00164     if (!d->prober)
00165         return d->proberState;
00166     if (d->proberState == Probing) {
00167         if (d->mStart) {
00168             d->unicodeTest(data, len);
00169             if (d->proberState == FoundIt)
00170                 return d->proberState;
00171         }
00172         d->prober->HandleData(data, len);
00173         switch (d->prober->GetState())
00174         {
00175             case kencodingprober::eNotMe:
00176                 d->proberState = NotMe;
00177                 break;
00178             case kencodingprober::eFoundIt:
00179                 d->proberState = FoundIt;
00180                 break;
00181             default:
00182                 d->proberState = Probing;
00183                 break;
00184         }
00185     }
00186 #ifdef DEBUG_PROBE
00187     d->prober->DumpStatus();
00188 #endif
00189     return d->proberState;
00190 }
00191 
00192 KEncodingProber::ProberState KEncodingProber::state() const
00193 {
00194     return d->proberState;
00195 }
00196 
00197 //DEPRECATED, do *not* use
00198 const char* KEncodingProber::encodingName() const
00199 {
00200     return strdup(encoding().constData());
00201 }
00202 
00203 QByteArray KEncodingProber::encoding() const
00204 {
00205     if (!d->prober)
00206         return QByteArray("UTF-8");
00207 
00208     return QByteArray(d->prober->GetCharSetName());
00209 }
00210 
00211 float KEncodingProber::confidence() const
00212 {
00213     if (!d->prober)
00214         return 0.0;
00215 
00216     return d->prober->GetConfidence();
00217 }
00218 
00219 KEncodingProber::ProberType KEncodingProber::proberType() const
00220 {
00221     return d->proberType;
00222 }
00223 
00224 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
00225 {
00226     d->setProberType(proberType);
00227     reset();
00228 }
00229 
00230 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
00231 {
00232     if (lang.isEmpty())
00233         return KEncodingProber::Universal;
00234     else if (lang==i18nc("@item Text character set", "Disabled"))
00235         return KEncodingProber::None;
00236     else if (lang==i18nc("@item Text character set", "Universal"))
00237         return KEncodingProber::Universal;
00238     else if (lang==i18nc("@item Text character set", "Unicode"))
00239         return KEncodingProber::Unicode;
00240     else if (lang==i18nc("@item Text character set", "Cyrillic"))
00241         return KEncodingProber::Cyrillic;
00242     else if (lang==i18nc("@item Text character set", "Western European"))
00243         return KEncodingProber::WesternEuropean;
00244     else if (lang==i18nc("@item Text character set", "Central European"))
00245         return KEncodingProber::CentralEuropean;
00246     else if (lang==i18nc("@item Text character set", "Greek"))
00247         return KEncodingProber::Greek;
00248     else if (lang==i18nc("@item Text character set", "Hebrew"))
00249         return KEncodingProber::Hebrew;
00250     else if (lang==i18nc("@item Text character set", "Turkish"))
00251         return KEncodingProber::Turkish;
00252     else if (lang==i18nc("@item Text character set", "Japanese"))
00253         return KEncodingProber::Japanese;
00254     else if (lang==i18nc("@item Text character set", "Baltic"))
00255         return KEncodingProber::Baltic;
00256     else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
00257         return KEncodingProber::ChineseTraditional;
00258     else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
00259         return KEncodingProber::ChineseSimplified;
00260     else if (lang==i18nc("@item Text character set", "Arabic"))
00261         return KEncodingProber::Arabic;
00262 
00263     return KEncodingProber::Universal;
00264 }
00265 
00266 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
00267 {
00268     switch (proberType)
00269     {
00270         case KEncodingProber::None:
00271             return i18nc("@item Text character set", "Disabled");
00272             break;
00273         case KEncodingProber::Universal:
00274             return i18nc("@item Text character set", "Universal");
00275             break;
00276         case KEncodingProber::Arabic:
00277             return i18nc("@item Text character set", "Arabic");
00278             break;
00279         case KEncodingProber::Baltic:
00280             return i18nc("@item Text character set", "Baltic");
00281             break;
00282         case KEncodingProber::CentralEuropean:
00283             return i18nc("@item Text character set", "Central European");
00284             break;
00285         case KEncodingProber::Cyrillic:
00286             return i18nc("@item Text character set", "Cyrillic");
00287             break;
00288         case KEncodingProber::Greek:
00289             return i18nc("@item Text character set", "Greek");
00290             break;
00291         case KEncodingProber::Hebrew:
00292             return i18nc("@item Text character set", "Hebrew");
00293             break;
00294         case KEncodingProber::Japanese:
00295             return i18nc("@item Text character set", "Japanese");
00296             break;
00297         case KEncodingProber::Turkish:
00298             return i18nc("@item Text character set", "Turkish");
00299             break;
00300         case KEncodingProber::WesternEuropean:
00301             return i18nc("@item Text character set", "Western European");
00302             break;
00303         case KEncodingProber::ChineseTraditional:
00304             return i18nc("@item Text character set", "Chinese Traditional");
00305             break;
00306         case KEncodingProber::ChineseSimplified:
00307             return i18nc("@item Text character set", "Chinese Simplified");
00308             break;
00309         case KEncodingProber::Korean:
00310             return i18nc("@item Text character set", "Korean");
00311             break;
00312         case KEncodingProber::Thai:
00313             return i18nc("@item Text character set", "Thai");
00314             break;
00315         case KEncodingProber::Unicode:
00316             return i18nc("@item Text character set", "Unicode");
00317             break;
00318         default:
00319             return QString();
00320         }
00321 }

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.6.1
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal