33 #define MAX_BUFFER 16*1024 62 switch (codec->mibEnum())
74 class KEncodingDetectorPrivate
78 QTextDecoder *m_decoder;
79 QTextCodec *m_defaultCodec;
80 QByteArray m_storeDecoderName;
87 bool m_writtingHappened : 1;
88 bool m_analyzeCalled : 1;
91 QByteArray m_bufferForDefferedEncDetection;
93 KEncodingDetectorPrivate()
94 : m_codec(QTextCodec::codecForMib(
MibLatin1))
95 , m_decoder(m_codec->makeDecoder())
96 , m_defaultCodec(m_codec)
101 , m_writtingHappened(false)
102 , m_analyzeCalled(false)
109 , m_decoder(m_codec->makeDecoder())
110 , m_defaultCodec(m_codec)
112 , m_autoDetectLanguage(script)
115 , m_writtingHappened(false)
116 , m_analyzeCalled(false)
121 ~KEncodingDetectorPrivate()
127 bool isExplicitlySpecifiedEncoding()
136 for (
int i = 0; i < size; ++i ) {
137 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
138 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
139 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
140 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
150 for (
int i = 0; i < size; ++i ) {
151 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
154 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
155 return "iso-8859-13";
158 return "iso-8859-13";
163 QByteArray charset = QByteArray();
164 for (
int i = 0; i < size; ++i ) {
165 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
166 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
176 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
180 if ( charset.isNull() )
181 charset =
"iso-8859-2";
187 if ( charset.isNull() )
188 charset =
"iso-8859-3";
190 return charset.data();
196 kWarning() <<
"KEncodingDetector: Cyr heuristics";
213 int cp1251_o_capital=0;
218 int cp1251_a_capital=0;
223 int cp1251_s_capital=0;
228 int cp1251_i_capital=0;
231 int cp1251_small_range=0;
232 int koi_small_range=0;
233 int ibm866_small_range=0;
236 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
240 ++cp1251_small_range;
244 else if (ptr[i]==0xe0)
246 else if (ptr[i]==0xe8)
248 else if (ptr[i]==0xf1)
250 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)
253 else if (ptr[i]==0xef)
255 else if (ptr[i]==0xe1)
257 else if (ptr[i]==0xe9)
259 else if (ptr[i]==0xf3)
263 else if (ptr[i]>0xbf)
267 if (ptr[i]==0xd0||ptr[i]==0xd1)
269 else if (ptr[i]==0xcf)
271 else if (ptr[i]==0xc1)
273 else if (ptr[i]==0xc9)
275 else if (ptr[i]==0xd3)
277 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)
280 else if (ptr[i]==0xce)
282 else if (ptr[i]==0xc0)
284 else if (ptr[i]==0xc8)
286 else if (ptr[i]==0xd1)
289 else if (ptr[i]>0x9f && ptr[i]<0xb0)
290 ++ibm866_small_range;
295 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
300 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
303 kWarning() <<
"Cyr Enc Detection: UTF8";
308 if (ibm866_small_range>cp1251_small_range+koi_small_range)
314 if (cp1251_st==0 && koi_st>1)
316 else if (koi_st==0 && cp1251_st>1)
319 if (cp1251_st && koi_st)
321 if (cp1251_st/koi_st>2)
323 else if (koi_st/cp1251_st>2)
329 else if (cp1251_a || koi_a)
334 else if (cp1251_o || koi_o)
339 else if (cp1251_i || koi_i)
344 else if (cp1251_s || koi_s)
347 if (cp1251_a_capital>koi_a_capital)
349 else if (cp1251_a_capital || koi_a_capital)
352 if (cp1251_o_capital>koi_o_capital)
354 else if (cp1251_o_capital || koi_o_capital)
357 if (cp1251_i_capital>koi_i_capital)
359 else if (cp1251_i_capital || koi_i_capital)
362 if (cp1251_s_capital>koi_s_capital)
364 else if (cp1251_s_capital || koi_s_capital)
367 kWarning()<<
"koi_score " << koi_score <<
" cp1251_score " << cp1251_score;
369 if (abs(koi_score-cp1251_score)<10)
372 cp1251_score=cp1251_small_range;
373 koi_score=koi_small_range;
375 if (cp1251_score>koi_score)
391 for (
int i = 0; i < size; ++i ) {
392 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
393 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
394 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
404 for (
int i = 0; i < size; ++i ) {
405 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
406 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
407 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
411 if ( ptr[ i ] == 0xDF )
412 return "iso-8859-8-i";
415 return "iso-8859-8-i";
422 switch ( kc.
guess_jp( (
const char*)ptr, size ) ) {
423 case JapaneseCode::JIS:
425 case JapaneseCode::EUC:
427 case JapaneseCode::SJIS:
429 case JapaneseCode::UTF8:
440 for (
int i = 0; i < size; ++i ) {
441 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
452 uint nonansi_count=0;
453 for (
int i=0; i<size; ++i)
458 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
462 if (ptr[i] >= 0x78 && ptr[i]<=0x9F )
471 return "iso-8859-15";
493 if (p[1]==
'-' && p[2]==
'>')
499 if (p[1] ==
'-' && p[2] ==
'!' && p[3] ==
'>')
514 int len = str.length();
515 int pos = str.indexOf(
"encoding");
521 while (pos<len && str[pos]<=
' ')
526 if (pos>=len || str[pos] !=
'=')
531 while (pos<len && str[pos]<=
' ')
539 char quoteMark = str[pos];
540 if (quoteMark !=
'"' && quoteMark !=
'\'')
546 while (end<len && str[end]!=quoteMark)
552 encodingLength = end-pos;
561 for (
int i=1; i < len; i+=2)
563 if ((data[i]==
'\0') && (data[i-1]==
'\0'))
587 if (d->m_codec->mibEnum()!=
MibUtf8)
594 static const unsigned char highest1Bits = 0x80;
595 static const unsigned char highest2Bits = 0xC0;
596 static const unsigned char highest3Bits = 0xE0;
597 static const unsigned char highest4Bits = 0xF0;
598 static const unsigned char highest5Bits = 0xF8;
600 for (
int i=0; i<length; ++i)
602 unsigned char c = data[i];
604 if (d->m_multiByte>0)
606 if ((c & highest2Bits) == 0x80)
612 kWarning() <<
"EncDetector: Broken UTF8";
618 if ((c & highest1Bits) == 0x00)
622 if ((c & highest3Bits) == 0xC0)
629 if ((c & highest4Bits) == 0xE0)
636 if ((c & highest5Bits) == 0xF0)
642 kWarning() <<
"EncDetector:_Broken UTF8";
655 d(new KEncodingDetectorPrivate(codec,source,script))
666 d->m_autoDetectLanguage=lang;
670 return d->m_autoDetectLanguage;
680 d->m_storeDecoderName = d->m_codec->name();
681 return d->m_storeDecoderName.constData();
686 return d->m_visualRTL;
701 assert(d->m_defaultCodec);
702 d->m_bufferForDefferedEncDetection.clear();
703 d->m_writtingHappened =
false;
704 d->m_analyzeCalled =
false;
708 d->m_codec = d->m_defaultCodec;
709 d->m_decoder = d->m_codec->makeDecoder();
715 QByteArray enc(_encoding);
719 codec=d->m_defaultCodec;
737 if (d->m_codec->mibEnum()==codec->mibEnum())
755 codec = QTextCodec::codecForName(
"iso8859-8-i");
758 if(!(enc==
"iso-8859-8-i"||enc==
"iso_8859-8-i"||enc==
"csiso88598i"||enc==
"logical"))
759 d->m_visualRTL =
true;
765 d->m_decoder = d->m_codec->makeDecoder();
767 kDebug(6005) <<
"KEncodingDetector::encoding used is" << d->m_codec->name();
775 if (!d->m_analyzeCalled)
778 d->m_analyzeCalled=
true;
781 return d->m_decoder->toUnicode(data,len);
786 processNull(const_cast<char *>(data.data()),data.size());
787 if (!d->m_analyzeCalled)
789 analyze(data.data(),data.size());
790 d->m_analyzeCalled=
true;
793 return d->m_decoder->toUnicode(data);
799 kWarning() <<
"KEncodingDetector: decoding "<<len<<
" bytes";
801 if (d->m_writtingHappened)
804 kWarning() <<
"KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name();
807 return d->m_decoder->toUnicode(data, len);
811 if (d->m_bufferForDefferedEncDetection.isEmpty())
815 if (
analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding()))
818 kWarning() <<
"KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name();
821 d->m_writtingHappened=
true;
822 return d->m_decoder->toUnicode(data, len);
827 kWarning() <<
"KEncodingDetector: begin deffer";
829 d->m_bufferForDefferedEncDetection=data;
834 d->m_bufferForDefferedEncDetection+=data;
837 bool detected =
analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
838 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
839 d->m_bufferForDefferedEncDetection.length() >
MAX_BUFFER)
841 d->m_writtingHappened=
true;
842 d->m_bufferForDefferedEncDetection.replace(
'\0',
' ');
843 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
844 d->m_bufferForDefferedEncDetection.clear();
846 kWarning() <<
"KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
858 return d->m_decoder ? d->m_decoder->hasFailure() :
false;
863 if (d->m_bufferForDefferedEncDetection.isEmpty())
866 d->m_bufferForDefferedEncDetection.replace(
'\0',
' ');
867 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
868 d->m_bufferForDefferedEncDetection.clear();
870 kWarning() <<
"KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<
" bytes "<< d->m_codec->name();
883 const uchar *udata = (
const uchar *)data;
889 const char *autoDetectedEncoding;
890 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
892 autoDetectedEncoding =
"UTF-16";
894 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
896 autoDetectedEncoding =
"UTF-8";
898 else if (c1 == 0x00 || c2 == 0x00)
906 uchar c10 = *udata++;
908 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
909 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
910 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
911 autoDetectedEncoding =
"UTF-16";
913 autoDetectedEncoding = 0;
917 autoDetectedEncoding = 0;
921 if (autoDetectedEncoding != 0)
924 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
928 d->m_decoder = d->m_codec->makeDecoder();
932 if (
is16Bit(d->m_codec) && c2==0x00)
935 char reverseUtf16[3] = {(char)0xFF, (
char)0xFE, 0x00};
936 d->m_decoder->toUnicode(reverseUtf16, 2);
946 kWarning() <<
"KEncodingDetector: UserChosenEncoding exit ";
963 const char *ptr = data;
964 const char *pEnd = data+len;
975 if (ptr[0] ==
'!' && ptr[1] ==
'-' && ptr[2] ==
'-')
983 if (ptr[0]==
'?' && ptr[1]==
'x' && ptr[2]==
'm' && ptr[3]==
'l')
985 const char *end = ptr;
986 while (*end !=
'>' && end < pEnd)
988 if (*end ==
'\0' || end == pEnd)
990 QByteArray str(ptr, end - ptr);
1002 !(((*ptr >=
'a') && (*ptr <=
'z')) ||
1003 ((*ptr >=
'A') && (*ptr <=
'Z')))
1010 const char* max=ptr+4;
1014 (((*ptr >=
'a') && (*ptr <=
'z')) ||
1015 ((*ptr >=
'A') && (*ptr <=
'Z')) ||
1016 ((*ptr >=
'0') && (*ptr <=
'9')))
1020 tmp[length] = tolower( *ptr );
1025 if (tmp[0]==
'm'&&tmp[1]==
'e'&&tmp[2]==
't'&&tmp[3]==
'a')
1028 const char* end = ptr;
1029 while(*end !=
'>' && *end !=
'\0' && end<pEnd)
1032 QByteArray str( ptr, (end-ptr)+1);
1033 str = str.toLower();
1034 const int strLength = str.length();
1038 if( (pos = str.indexOf(
"charset")) == -1)
1042 if( (pos = str.indexOf(
"=", pos)) == -1)
1049 while (pos < strLength && str[pos] <=
' ')
1053 if (pos < strLength && (str[pos] ==
'"' || str[pos] ==
'\''))
1057 while (pos < strLength && str[pos] <=
' ')
1060 if ( pos == strLength)
1064 while( endpos < strLength &&
1065 (str[endpos] !=
' ' && str[endpos] !=
'"' && str[endpos] !=
'\'' 1066 && str[endpos] !=
';' && str[endpos] !=
'>') )
1069 kDebug( 6005 ) <<
"KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1074 else if (tmp[0]==
'b'&&tmp[1]==
'o'&&tmp[2]==
'd'&&tmp[3]==
'y')
1086 kDebug( 6005 ) <<
"KEncodingDetector: using heuristics (" << strlen(data) <<
")";
1089 switch ( d->m_autoDetectLanguage)
1118 else if (d->m_defaultCodec->mibEnum()==
MibLatin1)
1149 else if (lang==
i18nc(
"@item Text character set",
"Unicode"))
1151 else if (lang==
i18nc(
"@item Text character set",
"Cyrillic"))
1153 else if (lang==
i18nc(
"@item Text character set",
"Western European"))
1155 else if (lang==
i18nc(
"@item Text character set",
"Central European"))
1157 else if (lang==
i18nc(
"@item Text character set",
"Greek"))
1159 else if (lang==
i18nc(
"@item Text character set",
"Hebrew"))
1161 else if (lang==
i18nc(
"@item Text character set",
"Turkish"))
1163 else if (lang==
i18nc(
"@item Text character set",
"Japanese"))
1165 else if (lang==
i18nc(
"@item Text character set",
"Baltic"))
1167 else if (lang==
i18nc(
"@item Text character set",
"Arabic"))
1212 return i18nc(
"@item Text character set",
"Arabic");
1215 return i18nc(
"@item Text character set",
"Baltic");
1218 return i18nc(
"@item Text character set",
"Central European");
1221 return i18nc(
"@item Text character set",
"Cyrillic");
1224 return i18nc(
"@item Text character set",
"Greek");
1227 return i18nc(
"@item Text character set",
"Hebrew");
1230 return i18nc(
"@item Text character set",
"Japanese");
1233 return i18nc(
"@item Text character set",
"Turkish");
1236 return i18nc(
"@item Text character set",
"Western European");
1239 return i18nc(
"@item Text character set",
"Chinese Traditional");
1242 return i18nc(
"@item Text character set",
"Chinese Simplified");
1245 return i18nc(
"@item Text character set",
"Korean");
1248 return i18nc(
"@item Text character set",
"Thai");
1251 return i18nc(
"@item Text character set",
"Unicode");
AutoDetectScript autoDetectLanguage() const
static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
static bool is16Bit(QTextCodec *codec)
Provides encoding detection capabilities.
QString decodeWithBuffering(const char *data, int len)
Convenience method that uses buffering.
bool visuallyOrdered() const
void setAutoDetectLanguage(AutoDetectScript)
bool decodedInvalidCharacters() const
This method checks whether invalid characters were found during a decoding operation.
static int findXMLEncoding(const QByteArray &str, int &encodingLength)
EncodingChoiceSource encodingChoiceSource() const
static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
static AutoDetectScript scriptForName(const QString &lang)
Takes the language name after it has been i18n()'ed.
QString i18nc(const char *ctxt, const char *text)
Returns a localized version of a string and a context.
static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
enum Type guess_jp(const char *buf, int buflen)
KCharsets * charsets()
The global charset manager.
static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
const char * encoding() const
Convenience method.
KEncodingDetector()
Default codec is Latin-1 (as the HTML spec says), EncodingChoiceSource is the default, AutoDetectScript is semiautomatic detection.
static void skipComment(const char *&ptr, const char *pEnd)
bool processNull(char *data, int length)
Kills all 0 bytes (or double 0 bytes) in the data and remembers whether the data was binary or not.
static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
QTextCodec * codecForName(const QString &name) const
Provided for compatibility.
bool setEncoding(const char *encoding, EncodingChoiceSource type)
QString flush()
Convenience method to be used with decodeForHtml.
static bool hasAutoDetectionForScript(AutoDetectScript)
bool errorsIfUtf8(const char *data, int length)
Checks whether we are really dealing with UTF-8.
void resetDecoder()
Resets the decoder.
QString decode(const char *data, int len)
The main class method.
bool analyze(const char *data, int len)
Analyze text data.
static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
static QString nameForScript(AutoDetectScript)