KDECore
kencodingprober.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "kencodingprober.h"
00024
00025 #include "klocale.h"
00026
00027 #include "probers/nsCharSetProber.h"
00028 #include "probers/nsUniversalDetector.h"
00029 #include "probers/ChineseGroupProber.h"
00030 #include "probers/JapaneseGroupProber.h"
00031 #include "probers/UnicodeGroupProber.h"
00032 #include "probers/nsSBCSGroupProber.h"
00033 #include "probers/nsMBCSGroupProber.h"
00034
00035 #include <string.h>
00036
00037 class KEncodingProberPrivate
00038 {
00039 public:
00040 KEncodingProberPrivate(): prober(NULL), mStart(true) {};
00041 ~KEncodingProberPrivate()
00042 {
00043 delete prober;
00044 }
00045 void setProberType(KEncodingProber::ProberType pType)
00046 {
00047 proberType = pType;
00048
00049
00050
00051
00052
00053
00054 delete prober;
00055
00056 switch (proberType) {
00057 case KEncodingProber::None:
00058 prober = NULL;
00059 break;
00060 case KEncodingProber::Arabic:
00061 case KEncodingProber::Baltic:
00062 case KEncodingProber::CentralEuropean:
00063 case KEncodingProber::Cyrillic:
00064 case KEncodingProber::Greek:
00065 case KEncodingProber::Hebrew:
00066 case KEncodingProber::NorthernSaami:
00067 case KEncodingProber::Other:
00068 case KEncodingProber::SouthEasternEurope:
00069 case KEncodingProber::Thai:
00070 case KEncodingProber::Turkish:
00071 case KEncodingProber::WesternEuropean:
00072 prober = new kencodingprober::nsSBCSGroupProber();
00073 break;
00074 case KEncodingProber::ChineseSimplified:
00075 case KEncodingProber::ChineseTraditional:
00076 prober = new kencodingprober::ChineseGroupProber();
00077 break;
00078 case KEncodingProber::Japanese:
00079 prober = new kencodingprober::JapaneseGroupProber();
00080 break;
00081 case KEncodingProber::Korean:
00082 prober = new kencodingprober::nsMBCSGroupProber();
00083 break;
00084 case KEncodingProber::Unicode:
00085 prober = new kencodingprober::UnicodeGroupProber();
00086 break;
00087 case KEncodingProber::Universal:
00088 prober = new kencodingprober::nsUniversalDetector();
00089 break;
00090 default:
00091 prober = NULL;
00092 }
00093 }
00094 void unicodeTest(const char *aBuf, int aLen)
00095 {
00096 if (mStart)
00097 {
00098 mStart = false;
00099 if (aLen > 3)
00100 switch (aBuf[0])
00101 {
00102 case '\xEF':
00103 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
00104
00105 proberState = KEncodingProber::FoundIt;
00106 break;
00107 case '\xFE':
00108 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00109
00110 proberState = KEncodingProber::FoundIt;
00111 else if ('\xFF' == aBuf[1])
00112
00113 proberState = KEncodingProber::FoundIt;
00114 break;
00115 case '\x00':
00116 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
00117
00118 proberState = KEncodingProber::FoundIt;
00119 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
00120
00121 proberState = KEncodingProber::FoundIt;
00122 break;
00123 case '\xFF':
00124 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
00125
00126 proberState = KEncodingProber::FoundIt;
00127 else if ('\xFE' == aBuf[1])
00128
00129 proberState = KEncodingProber::FoundIt;
00130 break;
00131 }
00132
00133 }
00134 }
00135 KEncodingProber::ProberType proberType;
00136 KEncodingProber::ProberState proberState;
00137 kencodingprober::nsCharSetProber *prober;
00138 bool mStart;
00139 };
00140
00141 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
00142 {
00143 setProberType(proberType);
00144 }
00145
00146 KEncodingProber::~KEncodingProber()
00147 {
00148 delete d;
00149 }
00150
00151 void KEncodingProber::reset()
00152 {
00153 d->proberState = KEncodingProber::Probing;
00154 d->mStart = true;
00155 }
00156
00157 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
00158 {
00159 return feed(data.data(), data.size());
00160 }
00161
00162 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
00163 {
00164 if (!d->prober)
00165 return d->proberState;
00166 if (d->proberState == Probing) {
00167 if (d->mStart) {
00168 d->unicodeTest(data, len);
00169 if (d->proberState == FoundIt)
00170 return d->proberState;
00171 }
00172 d->prober->HandleData(data, len);
00173 switch (d->prober->GetState())
00174 {
00175 case kencodingprober::eNotMe:
00176 d->proberState = NotMe;
00177 break;
00178 case kencodingprober::eFoundIt:
00179 d->proberState = FoundIt;
00180 break;
00181 default:
00182 d->proberState = Probing;
00183 break;
00184 }
00185 }
00186 #ifdef DEBUG_PROBE
00187 d->prober->DumpStatus();
00188 #endif
00189 return d->proberState;
00190 }
00191
00192 KEncodingProber::ProberState KEncodingProber::state() const
00193 {
00194 return d->proberState;
00195 }
00196
00197
00198 const char* KEncodingProber::encodingName() const
00199 {
00200 return strdup(encoding().constData());
00201 }
00202
00203 QByteArray KEncodingProber::encoding() const
00204 {
00205 if (!d->prober)
00206 return QByteArray("UTF-8");
00207
00208 return QByteArray(d->prober->GetCharSetName());
00209 }
00210
00211 float KEncodingProber::confidence() const
00212 {
00213 if (!d->prober)
00214 return 0.0;
00215
00216 return d->prober->GetConfidence();
00217 }
00218
00219 KEncodingProber::ProberType KEncodingProber::proberType() const
00220 {
00221 return d->proberType;
00222 }
00223
00224 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
00225 {
00226 d->setProberType(proberType);
00227 reset();
00228 }
00229
00230 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
00231 {
00232 if (lang.isEmpty())
00233 return KEncodingProber::Universal;
00234 else if (lang==i18nc("@item Text character set", "Disabled"))
00235 return KEncodingProber::None;
00236 else if (lang==i18nc("@item Text character set", "Universal"))
00237 return KEncodingProber::Universal;
00238 else if (lang==i18nc("@item Text character set", "Unicode"))
00239 return KEncodingProber::Unicode;
00240 else if (lang==i18nc("@item Text character set", "Cyrillic"))
00241 return KEncodingProber::Cyrillic;
00242 else if (lang==i18nc("@item Text character set", "Western European"))
00243 return KEncodingProber::WesternEuropean;
00244 else if (lang==i18nc("@item Text character set", "Central European"))
00245 return KEncodingProber::CentralEuropean;
00246 else if (lang==i18nc("@item Text character set", "Greek"))
00247 return KEncodingProber::Greek;
00248 else if (lang==i18nc("@item Text character set", "Hebrew"))
00249 return KEncodingProber::Hebrew;
00250 else if (lang==i18nc("@item Text character set", "Turkish"))
00251 return KEncodingProber::Turkish;
00252 else if (lang==i18nc("@item Text character set", "Japanese"))
00253 return KEncodingProber::Japanese;
00254 else if (lang==i18nc("@item Text character set", "Baltic"))
00255 return KEncodingProber::Baltic;
00256 else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
00257 return KEncodingProber::ChineseTraditional;
00258 else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
00259 return KEncodingProber::ChineseSimplified;
00260 else if (lang==i18nc("@item Text character set", "Arabic"))
00261 return KEncodingProber::Arabic;
00262
00263 return KEncodingProber::Universal;
00264 }
00265
00266 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
00267 {
00268 switch (proberType)
00269 {
00270 case KEncodingProber::None:
00271 return i18nc("@item Text character set", "Disabled");
00272 break;
00273 case KEncodingProber::Universal:
00274 return i18nc("@item Text character set", "Universal");
00275 break;
00276 case KEncodingProber::Arabic:
00277 return i18nc("@item Text character set", "Arabic");
00278 break;
00279 case KEncodingProber::Baltic:
00280 return i18nc("@item Text character set", "Baltic");
00281 break;
00282 case KEncodingProber::CentralEuropean:
00283 return i18nc("@item Text character set", "Central European");
00284 break;
00285 case KEncodingProber::Cyrillic:
00286 return i18nc("@item Text character set", "Cyrillic");
00287 break;
00288 case KEncodingProber::Greek:
00289 return i18nc("@item Text character set", "Greek");
00290 break;
00291 case KEncodingProber::Hebrew:
00292 return i18nc("@item Text character set", "Hebrew");
00293 break;
00294 case KEncodingProber::Japanese:
00295 return i18nc("@item Text character set", "Japanese");
00296 break;
00297 case KEncodingProber::Turkish:
00298 return i18nc("@item Text character set", "Turkish");
00299 break;
00300 case KEncodingProber::WesternEuropean:
00301 return i18nc("@item Text character set", "Western European");
00302 break;
00303 case KEncodingProber::ChineseTraditional:
00304 return i18nc("@item Text character set", "Chinese Traditional");
00305 break;
00306 case KEncodingProber::ChineseSimplified:
00307 return i18nc("@item Text character set", "Chinese Simplified");
00308 break;
00309 case KEncodingProber::Korean:
00310 return i18nc("@item Text character set", "Korean");
00311 break;
00312 case KEncodingProber::Thai:
00313 return i18nc("@item Text character set", "Thai");
00314 break;
00315 case KEncodingProber::Unicode:
00316 return i18nc("@item Text character set", "Unicode");
00317 break;
00318 default:
00319 return QString();
00320 }
00321 }