From 303e99050b18bd1d0ecea1fe269c9907a1201ff1 Mon Sep 17 00:00:00 2001 From: Rainer Kottenhoff Date: Thu, 5 Mar 2026 08:52:25 +0100 Subject: [PATCH] fix: utf-8 affinity for uchardet uncertainty (lower than 66%) --- src/EncodingDetection.cpp | 23 ++++++++++++------ src/uchardet/uchardet/test_moved.readme | 1 + .../encoding/uchardet}/CMakeLists.txt | 0 .../encoding/uchardet}/ar/iso-8859-6.txt | 0 .../encoding/uchardet}/ar/utf-8.txt | 0 .../encoding/uchardet}/ar/windows-1256.txt | 0 .../encoding/uchardet}/bg/windows-1251.txt | 0 .../encoding/uchardet}/cs/ibm852.txt | 0 .../encoding/uchardet}/cs/iso-8859-2.txt | 0 .../uchardet}/cs/mac-centraleurope.txt | 0 .../encoding/uchardet}/cs/utf-8.txt | 0 .../encoding/uchardet}/cs/windows-1250.txt | 0 .../encoding/uchardet}/da/ibm865.txt | 0 .../encoding/uchardet}/da/iso-8859-1.txt | 0 .../encoding/uchardet}/da/iso-8859-15.txt | 0 .../encoding/uchardet}/da/utf-8.txt | 0 .../encoding/uchardet}/da/windows-1252.txt | 0 .../encoding/uchardet}/de/iso-8859-1.txt | 0 .../encoding/uchardet}/de/windows-1252.txt | 0 .../encoding/uchardet}/el/iso-8859-7.txt | 0 .../encoding/uchardet}/el/utf-8.txt | 0 .../encoding/uchardet}/el/windows-1253.txt | 0 .../encoding/uchardet}/en/ascii.txt | 0 .../encoding/uchardet}/eo/iso-8859-3.txt | 0 .../encoding/uchardet}/es/iso-8859-1.txt | 0 .../encoding/uchardet}/es/iso-8859-15.txt | 0 .../encoding/uchardet}/es/utf-8.txt | 0 .../encoding/uchardet}/es/windows-1252.txt | 0 .../encoding/uchardet}/et/iso-8859-13.txt | 0 .../encoding/uchardet}/et/iso-8859-15.txt | 0 .../encoding/uchardet}/et/iso-8859-4.txt | 0 .../encoding/uchardet}/et/utf-8.txt | 0 .../encoding/uchardet}/et/windows-1252.txt | 0 .../encoding/uchardet}/et/windows-1257.txt | 0 .../encoding/uchardet}/fi/iso-8859-1.txt | 0 .../encoding/uchardet}/fi/utf-8.txt | 0 .../encoding/uchardet}/fr/iso-8859-1.txt | 0 .../encoding/uchardet}/fr/iso-8859-15.txt | 0 .../encoding/uchardet}/fr/utf-16.be | Bin .../encoding/uchardet}/fr/utf-32.le | Bin .../encoding/uchardet}/fr/utf-8.txt | 0 .../encoding/uchardet}/fr/windows-1252.txt | 0 .../encoding/uchardet}/ga/iso-8859-1.txt | 0 .../encoding/uchardet}/ga/utf-8.txt | 0 .../encoding/uchardet}/ga/windows-1252.txt | 0 .../encoding/uchardet}/he/iso-8859-8.txt | 0 .../encoding/uchardet}/he/utf-8.txt | 0 .../encoding/uchardet}/he/windows-1255.txt | 0 .../encoding/uchardet}/hr/ibm852.txt | 0 .../encoding/uchardet}/hr/iso-8859-13.txt | 0 .../encoding/uchardet}/hr/iso-8859-16.txt | 0 .../encoding/uchardet}/hr/iso-8859-2.txt | 0 .../uchardet}/hr/mac-centraleurope.txt | 0 .../encoding/uchardet}/hr/utf-8.txt | 0 .../encoding/uchardet}/hr/windows-1250.txt | 0 .../encoding/uchardet}/hu/iso-8859-2.txt | 0 .../encoding/uchardet}/hu/windows-1250.txt | 0 .../encoding/uchardet}/it/iso-8859-1.txt | 0 .../encoding/uchardet}/it/utf-8.txt | 0 .../encoding/uchardet}/ja/euc-jp.txt | 0 .../encoding/uchardet}/ja/iso-2022-jp.txt | 0 .../encoding/uchardet}/ja/shift_jis.txt | 0 .../encoding/uchardet}/ja/utf-16be.txt | Bin .../encoding/uchardet}/ja/utf-16le.txt | Bin .../encoding/uchardet}/ja/utf-8.txt | 0 .../encoding/uchardet}/ko/iso-2022-kr.txt | 0 .../test_files/encoding/uchardet}/ko/uhc.smi | 0 .../encoding/uchardet}/ko/utf-16.le | Bin .../encoding/uchardet}/ko/utf-32.be | Bin .../encoding/uchardet}/ko/utf-8.txt | 0 .../encoding/uchardet}/lt/iso-8859-10.txt | 0 .../encoding/uchardet}/lt/iso-8859-13.txt | 0 .../encoding/uchardet}/lt/iso-8859-4.txt | 0 .../encoding/uchardet}/lt/utf-8.txt | 0 .../encoding/uchardet}/lv/iso-8859-10.txt | 0 .../encoding/uchardet}/lv/iso-8859-13.txt | 0 .../encoding/uchardet}/lv/iso-8859-4.txt | 0 .../encoding/uchardet}/lv/utf-8.txt | 0 .../encoding/uchardet}/mt/iso-8859-3.txt | 0 .../encoding/uchardet}/mt/utf-8.txt | 0 .../encoding/uchardet}/no/ibm865.txt | 0 .../encoding/uchardet}/no/iso-8859-1.txt | 0 .../encoding/uchardet}/no/iso-8859-15.txt | 0 .../encoding/uchardet}/no/utf-8.txt | 0 .../encoding/uchardet}/no/windows-1252.txt | 0 .../encoding/uchardet}/pl/ibm852.txt | 0 .../encoding/uchardet}/pl/iso-8859-13.txt | 0 .../encoding/uchardet}/pl/iso-8859-16.txt | 0 .../encoding/uchardet}/pl/iso-8859-2.txt | 0 .../uchardet}/pl/mac-centraleurope.txt | 0 .../encoding/uchardet}/pl/utf-8.txt | 0 .../encoding/uchardet}/pl/windows-1250.txt | 0 .../encoding/uchardet}/pt/iso-8859-1.txt | 0 .../encoding/uchardet}/pt/utf-8.txt | 0 .../encoding/uchardet}/ro/ibm852.txt | 0 .../encoding/uchardet}/ro/iso-8859-16.txt | 0 .../encoding/uchardet}/ro/utf-8.txt | 0 .../encoding/uchardet}/ro/windows-1250.txt | 0 .../encoding/uchardet}/ru/ibm855.txt | 0 .../encoding/uchardet}/ru/ibm866.txt | 0 .../encoding/uchardet}/ru/iso-8859-5.txt | 0 .../encoding/uchardet}/ru/koi8-r.txt | 0 .../encoding/uchardet}/ru/mac-cyrillic.txt | 0 .../encoding/uchardet}/ru/windows-1251.txt | 0 .../encoding/uchardet}/sk/ibm852.txt | 0 .../encoding/uchardet}/sk/iso-8859-2.txt | 0 .../uchardet}/sk/mac-centraleurope.txt | 0 .../encoding/uchardet}/sk/utf-8.txt | 0 .../encoding/uchardet}/sk/windows-1250.txt | 0 .../encoding/uchardet}/sl/ibm852.txt | 0 .../encoding/uchardet}/sl/iso-8859-16.txt | 0 .../encoding/uchardet}/sl/iso-8859-2.txt | 0 .../uchardet}/sl/mac-centraleurope.txt | 0 .../encoding/uchardet}/sl/utf-8.txt | 0 .../encoding/uchardet}/sl/windows-1250.txt | 0 .../encoding/uchardet}/sv/iso-8859-1.txt | 0 .../encoding/uchardet}/sv/utf-8.txt | 0 .../encoding/uchardet}/sv/windows-1252.txt | 0 .../encoding/uchardet}/th/iso-8859-11.txt | 0 .../encoding/uchardet}/th/tis-620.txt | 0 .../encoding/uchardet}/th/utf-8.txt | 0 .../encoding/uchardet}/tr/iso-8859-3.txt | 0 .../encoding/uchardet}/tr/iso-8859-9.txt | 0 .../encoding/uchardet}/uchardet-tests.c | 0 .../encoding/uchardet}/vi/utf-8.txt | 0 .../encoding/uchardet}/vi/viscii.txt | 0 .../encoding/uchardet}/vi/windows-1258.txt | 0 .../test_files/encoding/uchardet}/zh/big5.txt | 0 .../encoding/uchardet}/zh/euc-tw.txt | 0 .../encoding/uchardet}/zh/gb18030.txt | 0 .../encoding/uchardet}/zh/utf-8.txt | 0 131 files changed, 17 insertions(+), 7 deletions(-) create mode 100644 src/uchardet/uchardet/test_moved.readme rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/CMakeLists.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ar/iso-8859-6.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ar/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ar/windows-1256.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/bg/windows-1251.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/cs/ibm852.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/cs/iso-8859-2.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/cs/mac-centraleurope.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/cs/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/cs/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/da/ibm865.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/da/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/da/iso-8859-15.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/da/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/da/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/de/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/de/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/el/iso-8859-7.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/el/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/el/windows-1253.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/en/ascii.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/eo/iso-8859-3.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/es/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/es/iso-8859-15.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/es/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/es/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/et/iso-8859-13.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/et/iso-8859-15.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/et/iso-8859-4.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/et/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/et/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/et/windows-1257.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fi/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fi/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fr/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fr/iso-8859-15.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fr/utf-16.be (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fr/utf-32.le (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fr/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/fr/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ga/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ga/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ga/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/he/iso-8859-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/he/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/he/windows-1255.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/ibm852.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/iso-8859-13.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/iso-8859-16.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/iso-8859-2.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/mac-centraleurope.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hr/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hu/iso-8859-2.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/hu/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/it/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/it/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ja/euc-jp.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ja/iso-2022-jp.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ja/shift_jis.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ja/utf-16be.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ja/utf-16le.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ja/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ko/iso-2022-kr.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ko/uhc.smi (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ko/utf-16.le (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ko/utf-32.be (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ko/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lt/iso-8859-10.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lt/iso-8859-13.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lt/iso-8859-4.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lt/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lv/iso-8859-10.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lv/iso-8859-13.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lv/iso-8859-4.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/lv/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/mt/iso-8859-3.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/mt/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/no/ibm865.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/no/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/no/iso-8859-15.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/no/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/no/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/ibm852.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/iso-8859-13.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/iso-8859-16.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/iso-8859-2.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/mac-centraleurope.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pl/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pt/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/pt/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ro/ibm852.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ro/iso-8859-16.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ro/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ro/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ru/ibm855.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ru/ibm866.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ru/iso-8859-5.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ru/koi8-r.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ru/mac-cyrillic.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/ru/windows-1251.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sk/ibm852.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sk/iso-8859-2.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sk/mac-centraleurope.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sk/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sk/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sl/ibm852.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sl/iso-8859-16.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sl/iso-8859-2.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sl/mac-centraleurope.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sl/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sl/windows-1250.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sv/iso-8859-1.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sv/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/sv/windows-1252.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/th/iso-8859-11.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/th/tis-620.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/th/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/tr/iso-8859-3.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/tr/iso-8859-9.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/uchardet-tests.c (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/vi/utf-8.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/vi/viscii.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/vi/windows-1258.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/zh/big5.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/zh/euc-tw.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/zh/gb18030.txt (100%) rename {src/uchardet/uchardet/test => test/test_files/encoding/uchardet}/zh/utf-8.txt (100%) diff --git a/src/EncodingDetection.cpp b/src/EncodingDetection.cpp index 3bb5e3009..05eb4eed7 100644 --- a/src/EncodingDetection.cpp +++ b/src/EncodingDetection.cpp @@ -647,11 +647,7 @@ constexpr cpi_enc_t _MapStdEncodingString2CPI(const char* encStrg, float* pConfi bool bBOM = false; bool bReverse = false; cpi_enc_t const cpi = GetUnicodeEncoding(text, len, &bBOM, &bReverse); - if (!Encoding_IsNONE(cpiEncoding)) { - cpiEncoding = cpi; - } else { - cpiEncoding = bBOM ? (bReverse ? CPI_UNICODEBE : CPI_UNICODE) : (bReverse ? CPI_UNICODEBE : CPI_UNICODE); - } + cpiEncoding = cpi; } // check for default ANSI @@ -814,6 +810,7 @@ void Encoding_AnalyzeText(const char* const text, const size_t len, // --- re-mapping UCD ---- switch (Encoding_GetCodePage(cpiEncoding_UCD)) { + case 28591: // ISO 8859 - 1 mapped to Windows - 1252 (HTML5 Standard advice) cpiEncoding_UCD = Encoding_GetByCodePage(1252); // auto detect default ANSI (!) break; @@ -1303,6 +1300,7 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD } if (!IS_ENC_ENFORCED() || bForceEncDetection) { + if (!bSkipANSICPDetection) { // --------------------------------------------------------------------------- Encoding_AnalyzeText(lpData, cbNbytes4Analysis, &encDetRes, iAnalyzeHint); @@ -1375,11 +1373,22 @@ extern "C" ENC_DET_T Encoding_DetectEncoding(const HPATHL hpath, const char* lpD } else if (Encoding_IsValid(encDetRes.analyzedEncoding) && (encDetRes.bIsAnalysisReliable || !Settings.UseReliableCEDonly)) { - encDetRes.Encoding = encDetRes.analyzedEncoding; + if (!encDetRes.bIsAnalysisReliable && !Encoding_IsUTF8(encDetRes.analyzedEncoding) && encDetRes.bValidUTF8) { + encDetRes.Encoding = CPI_UTF8; // unreliable non-UTF-8 guess, but data is valid UTF-8 + } else { + encDetRes.Encoding = encDetRes.analyzedEncoding; + } + } + else if (!encDetRes.bIsAnalysisReliable && Encoding_IsValid(encDetRes.analyzedEncoding)) + { + // UCHARDET below confidence threshold (UseReliableCEDonly is true) + encDetRes.Encoding = encDetRes.bValidUTF8 ? CPI_UTF8 : CPI_ANSI_DEFAULT; } else if (Encoding_IsUNICODE(encDetRes.unicodeAnalysis) && (iConfidence > 66)) { - encDetRes.Encoding = encDetRes.analyzedEncoding; // (1) rely on analyzed encoding + // unicodeAnalysis (IsTextUnicode) confirms Unicode structure, + // iConfidence is from UCHARDET analysis — use analyzedEncoding (intentional) + encDetRes.Encoding = encDetRes.analyzedEncoding; } else if (Encoding_IsValid(Encoding_SrcWeak(CPI_GET))) { diff --git a/src/uchardet/uchardet/test_moved.readme b/src/uchardet/uchardet/test_moved.readme new file mode 100644 index 000000000..7e5c8f255 --- /dev/null +++ b/src/uchardet/uchardet/test_moved.readme @@ -0,0 +1 @@ +tests are moved to Notepad3\test\test_files\encoding\uchardet\ diff --git a/src/uchardet/uchardet/test/CMakeLists.txt b/test/test_files/encoding/uchardet/CMakeLists.txt similarity index 100% rename from src/uchardet/uchardet/test/CMakeLists.txt rename to test/test_files/encoding/uchardet/CMakeLists.txt diff --git a/src/uchardet/uchardet/test/ar/iso-8859-6.txt b/test/test_files/encoding/uchardet/ar/iso-8859-6.txt similarity index 100% rename from src/uchardet/uchardet/test/ar/iso-8859-6.txt rename to test/test_files/encoding/uchardet/ar/iso-8859-6.txt diff --git a/src/uchardet/uchardet/test/ar/utf-8.txt b/test/test_files/encoding/uchardet/ar/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/ar/utf-8.txt rename to test/test_files/encoding/uchardet/ar/utf-8.txt diff --git a/src/uchardet/uchardet/test/ar/windows-1256.txt b/test/test_files/encoding/uchardet/ar/windows-1256.txt similarity index 100% rename from src/uchardet/uchardet/test/ar/windows-1256.txt rename to test/test_files/encoding/uchardet/ar/windows-1256.txt diff --git a/src/uchardet/uchardet/test/bg/windows-1251.txt b/test/test_files/encoding/uchardet/bg/windows-1251.txt similarity index 100% rename from src/uchardet/uchardet/test/bg/windows-1251.txt rename to test/test_files/encoding/uchardet/bg/windows-1251.txt diff --git a/src/uchardet/uchardet/test/cs/ibm852.txt b/test/test_files/encoding/uchardet/cs/ibm852.txt similarity index 100% rename from src/uchardet/uchardet/test/cs/ibm852.txt rename to test/test_files/encoding/uchardet/cs/ibm852.txt diff --git a/src/uchardet/uchardet/test/cs/iso-8859-2.txt b/test/test_files/encoding/uchardet/cs/iso-8859-2.txt similarity index 100% rename from src/uchardet/uchardet/test/cs/iso-8859-2.txt rename to test/test_files/encoding/uchardet/cs/iso-8859-2.txt diff --git a/src/uchardet/uchardet/test/cs/mac-centraleurope.txt b/test/test_files/encoding/uchardet/cs/mac-centraleurope.txt similarity index 100% rename from src/uchardet/uchardet/test/cs/mac-centraleurope.txt rename to test/test_files/encoding/uchardet/cs/mac-centraleurope.txt diff --git a/src/uchardet/uchardet/test/cs/utf-8.txt b/test/test_files/encoding/uchardet/cs/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/cs/utf-8.txt rename to test/test_files/encoding/uchardet/cs/utf-8.txt diff --git a/src/uchardet/uchardet/test/cs/windows-1250.txt b/test/test_files/encoding/uchardet/cs/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/cs/windows-1250.txt rename to test/test_files/encoding/uchardet/cs/windows-1250.txt diff --git a/src/uchardet/uchardet/test/da/ibm865.txt b/test/test_files/encoding/uchardet/da/ibm865.txt similarity index 100% rename from src/uchardet/uchardet/test/da/ibm865.txt rename to test/test_files/encoding/uchardet/da/ibm865.txt diff --git a/src/uchardet/uchardet/test/da/iso-8859-1.txt b/test/test_files/encoding/uchardet/da/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/da/iso-8859-1.txt rename to test/test_files/encoding/uchardet/da/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/da/iso-8859-15.txt b/test/test_files/encoding/uchardet/da/iso-8859-15.txt similarity index 100% rename from src/uchardet/uchardet/test/da/iso-8859-15.txt rename to test/test_files/encoding/uchardet/da/iso-8859-15.txt diff --git a/src/uchardet/uchardet/test/da/utf-8.txt b/test/test_files/encoding/uchardet/da/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/da/utf-8.txt rename to test/test_files/encoding/uchardet/da/utf-8.txt diff --git a/src/uchardet/uchardet/test/da/windows-1252.txt b/test/test_files/encoding/uchardet/da/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/da/windows-1252.txt rename to test/test_files/encoding/uchardet/da/windows-1252.txt diff --git a/src/uchardet/uchardet/test/de/iso-8859-1.txt b/test/test_files/encoding/uchardet/de/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/de/iso-8859-1.txt rename to test/test_files/encoding/uchardet/de/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/de/windows-1252.txt b/test/test_files/encoding/uchardet/de/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/de/windows-1252.txt rename to test/test_files/encoding/uchardet/de/windows-1252.txt diff --git a/src/uchardet/uchardet/test/el/iso-8859-7.txt b/test/test_files/encoding/uchardet/el/iso-8859-7.txt similarity index 100% rename from src/uchardet/uchardet/test/el/iso-8859-7.txt rename to test/test_files/encoding/uchardet/el/iso-8859-7.txt diff --git a/src/uchardet/uchardet/test/el/utf-8.txt b/test/test_files/encoding/uchardet/el/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/el/utf-8.txt rename to test/test_files/encoding/uchardet/el/utf-8.txt diff --git a/src/uchardet/uchardet/test/el/windows-1253.txt b/test/test_files/encoding/uchardet/el/windows-1253.txt similarity index 100% rename from src/uchardet/uchardet/test/el/windows-1253.txt rename to test/test_files/encoding/uchardet/el/windows-1253.txt diff --git a/src/uchardet/uchardet/test/en/ascii.txt b/test/test_files/encoding/uchardet/en/ascii.txt similarity index 100% rename from src/uchardet/uchardet/test/en/ascii.txt rename to test/test_files/encoding/uchardet/en/ascii.txt diff --git a/src/uchardet/uchardet/test/eo/iso-8859-3.txt b/test/test_files/encoding/uchardet/eo/iso-8859-3.txt similarity index 100% rename from src/uchardet/uchardet/test/eo/iso-8859-3.txt rename to test/test_files/encoding/uchardet/eo/iso-8859-3.txt diff --git a/src/uchardet/uchardet/test/es/iso-8859-1.txt b/test/test_files/encoding/uchardet/es/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/es/iso-8859-1.txt rename to test/test_files/encoding/uchardet/es/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/es/iso-8859-15.txt b/test/test_files/encoding/uchardet/es/iso-8859-15.txt similarity index 100% rename from src/uchardet/uchardet/test/es/iso-8859-15.txt rename to test/test_files/encoding/uchardet/es/iso-8859-15.txt diff --git a/src/uchardet/uchardet/test/es/utf-8.txt b/test/test_files/encoding/uchardet/es/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/es/utf-8.txt rename to test/test_files/encoding/uchardet/es/utf-8.txt diff --git a/src/uchardet/uchardet/test/es/windows-1252.txt b/test/test_files/encoding/uchardet/es/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/es/windows-1252.txt rename to test/test_files/encoding/uchardet/es/windows-1252.txt diff --git a/src/uchardet/uchardet/test/et/iso-8859-13.txt b/test/test_files/encoding/uchardet/et/iso-8859-13.txt similarity index 100% rename from src/uchardet/uchardet/test/et/iso-8859-13.txt rename to test/test_files/encoding/uchardet/et/iso-8859-13.txt diff --git a/src/uchardet/uchardet/test/et/iso-8859-15.txt b/test/test_files/encoding/uchardet/et/iso-8859-15.txt similarity index 100% rename from src/uchardet/uchardet/test/et/iso-8859-15.txt rename to test/test_files/encoding/uchardet/et/iso-8859-15.txt diff --git a/src/uchardet/uchardet/test/et/iso-8859-4.txt b/test/test_files/encoding/uchardet/et/iso-8859-4.txt similarity index 100% rename from src/uchardet/uchardet/test/et/iso-8859-4.txt rename to test/test_files/encoding/uchardet/et/iso-8859-4.txt diff --git a/src/uchardet/uchardet/test/et/utf-8.txt b/test/test_files/encoding/uchardet/et/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/et/utf-8.txt rename to test/test_files/encoding/uchardet/et/utf-8.txt diff --git a/src/uchardet/uchardet/test/et/windows-1252.txt b/test/test_files/encoding/uchardet/et/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/et/windows-1252.txt rename to test/test_files/encoding/uchardet/et/windows-1252.txt diff --git a/src/uchardet/uchardet/test/et/windows-1257.txt b/test/test_files/encoding/uchardet/et/windows-1257.txt similarity index 100% rename from src/uchardet/uchardet/test/et/windows-1257.txt rename to test/test_files/encoding/uchardet/et/windows-1257.txt diff --git a/src/uchardet/uchardet/test/fi/iso-8859-1.txt b/test/test_files/encoding/uchardet/fi/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/fi/iso-8859-1.txt rename to test/test_files/encoding/uchardet/fi/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/fi/utf-8.txt b/test/test_files/encoding/uchardet/fi/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/fi/utf-8.txt rename to test/test_files/encoding/uchardet/fi/utf-8.txt diff --git a/src/uchardet/uchardet/test/fr/iso-8859-1.txt b/test/test_files/encoding/uchardet/fr/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/fr/iso-8859-1.txt rename to test/test_files/encoding/uchardet/fr/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/fr/iso-8859-15.txt b/test/test_files/encoding/uchardet/fr/iso-8859-15.txt similarity index 100% rename from src/uchardet/uchardet/test/fr/iso-8859-15.txt rename to test/test_files/encoding/uchardet/fr/iso-8859-15.txt diff --git a/src/uchardet/uchardet/test/fr/utf-16.be b/test/test_files/encoding/uchardet/fr/utf-16.be similarity index 100% rename from src/uchardet/uchardet/test/fr/utf-16.be rename to test/test_files/encoding/uchardet/fr/utf-16.be diff --git a/src/uchardet/uchardet/test/fr/utf-32.le b/test/test_files/encoding/uchardet/fr/utf-32.le similarity index 100% rename from src/uchardet/uchardet/test/fr/utf-32.le rename to test/test_files/encoding/uchardet/fr/utf-32.le diff --git a/src/uchardet/uchardet/test/fr/utf-8.txt b/test/test_files/encoding/uchardet/fr/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/fr/utf-8.txt rename to test/test_files/encoding/uchardet/fr/utf-8.txt diff --git a/src/uchardet/uchardet/test/fr/windows-1252.txt b/test/test_files/encoding/uchardet/fr/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/fr/windows-1252.txt rename to test/test_files/encoding/uchardet/fr/windows-1252.txt diff --git a/src/uchardet/uchardet/test/ga/iso-8859-1.txt b/test/test_files/encoding/uchardet/ga/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/ga/iso-8859-1.txt rename to test/test_files/encoding/uchardet/ga/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/ga/utf-8.txt b/test/test_files/encoding/uchardet/ga/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/ga/utf-8.txt rename to test/test_files/encoding/uchardet/ga/utf-8.txt diff --git a/src/uchardet/uchardet/test/ga/windows-1252.txt b/test/test_files/encoding/uchardet/ga/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/ga/windows-1252.txt rename to test/test_files/encoding/uchardet/ga/windows-1252.txt diff --git a/src/uchardet/uchardet/test/he/iso-8859-8.txt b/test/test_files/encoding/uchardet/he/iso-8859-8.txt similarity index 100% rename from src/uchardet/uchardet/test/he/iso-8859-8.txt rename to test/test_files/encoding/uchardet/he/iso-8859-8.txt diff --git a/src/uchardet/uchardet/test/he/utf-8.txt b/test/test_files/encoding/uchardet/he/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/he/utf-8.txt rename to test/test_files/encoding/uchardet/he/utf-8.txt diff --git a/src/uchardet/uchardet/test/he/windows-1255.txt b/test/test_files/encoding/uchardet/he/windows-1255.txt similarity index 100% rename from src/uchardet/uchardet/test/he/windows-1255.txt rename to test/test_files/encoding/uchardet/he/windows-1255.txt diff --git a/src/uchardet/uchardet/test/hr/ibm852.txt b/test/test_files/encoding/uchardet/hr/ibm852.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/ibm852.txt rename to test/test_files/encoding/uchardet/hr/ibm852.txt diff --git a/src/uchardet/uchardet/test/hr/iso-8859-13.txt b/test/test_files/encoding/uchardet/hr/iso-8859-13.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/iso-8859-13.txt rename to test/test_files/encoding/uchardet/hr/iso-8859-13.txt diff --git a/src/uchardet/uchardet/test/hr/iso-8859-16.txt b/test/test_files/encoding/uchardet/hr/iso-8859-16.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/iso-8859-16.txt rename to test/test_files/encoding/uchardet/hr/iso-8859-16.txt diff --git a/src/uchardet/uchardet/test/hr/iso-8859-2.txt b/test/test_files/encoding/uchardet/hr/iso-8859-2.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/iso-8859-2.txt rename to test/test_files/encoding/uchardet/hr/iso-8859-2.txt diff --git a/src/uchardet/uchardet/test/hr/mac-centraleurope.txt b/test/test_files/encoding/uchardet/hr/mac-centraleurope.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/mac-centraleurope.txt rename to test/test_files/encoding/uchardet/hr/mac-centraleurope.txt diff --git a/src/uchardet/uchardet/test/hr/utf-8.txt b/test/test_files/encoding/uchardet/hr/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/utf-8.txt rename to test/test_files/encoding/uchardet/hr/utf-8.txt diff --git a/src/uchardet/uchardet/test/hr/windows-1250.txt b/test/test_files/encoding/uchardet/hr/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/hr/windows-1250.txt rename to test/test_files/encoding/uchardet/hr/windows-1250.txt diff --git a/src/uchardet/uchardet/test/hu/iso-8859-2.txt b/test/test_files/encoding/uchardet/hu/iso-8859-2.txt similarity index 100% rename from src/uchardet/uchardet/test/hu/iso-8859-2.txt rename to test/test_files/encoding/uchardet/hu/iso-8859-2.txt diff --git a/src/uchardet/uchardet/test/hu/windows-1250.txt b/test/test_files/encoding/uchardet/hu/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/hu/windows-1250.txt rename to test/test_files/encoding/uchardet/hu/windows-1250.txt diff --git a/src/uchardet/uchardet/test/it/iso-8859-1.txt b/test/test_files/encoding/uchardet/it/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/it/iso-8859-1.txt rename to test/test_files/encoding/uchardet/it/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/it/utf-8.txt b/test/test_files/encoding/uchardet/it/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/it/utf-8.txt rename to test/test_files/encoding/uchardet/it/utf-8.txt diff --git a/src/uchardet/uchardet/test/ja/euc-jp.txt b/test/test_files/encoding/uchardet/ja/euc-jp.txt similarity index 100% rename from src/uchardet/uchardet/test/ja/euc-jp.txt rename to test/test_files/encoding/uchardet/ja/euc-jp.txt diff --git a/src/uchardet/uchardet/test/ja/iso-2022-jp.txt b/test/test_files/encoding/uchardet/ja/iso-2022-jp.txt similarity index 100% rename from src/uchardet/uchardet/test/ja/iso-2022-jp.txt rename to test/test_files/encoding/uchardet/ja/iso-2022-jp.txt diff --git a/src/uchardet/uchardet/test/ja/shift_jis.txt b/test/test_files/encoding/uchardet/ja/shift_jis.txt similarity index 100% rename from src/uchardet/uchardet/test/ja/shift_jis.txt rename to test/test_files/encoding/uchardet/ja/shift_jis.txt diff --git a/src/uchardet/uchardet/test/ja/utf-16be.txt b/test/test_files/encoding/uchardet/ja/utf-16be.txt similarity index 100% rename from src/uchardet/uchardet/test/ja/utf-16be.txt rename to test/test_files/encoding/uchardet/ja/utf-16be.txt diff --git a/src/uchardet/uchardet/test/ja/utf-16le.txt b/test/test_files/encoding/uchardet/ja/utf-16le.txt similarity index 100% rename from src/uchardet/uchardet/test/ja/utf-16le.txt rename to test/test_files/encoding/uchardet/ja/utf-16le.txt diff --git a/src/uchardet/uchardet/test/ja/utf-8.txt b/test/test_files/encoding/uchardet/ja/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/ja/utf-8.txt rename to test/test_files/encoding/uchardet/ja/utf-8.txt diff --git a/src/uchardet/uchardet/test/ko/iso-2022-kr.txt b/test/test_files/encoding/uchardet/ko/iso-2022-kr.txt similarity index 100% rename from src/uchardet/uchardet/test/ko/iso-2022-kr.txt rename to test/test_files/encoding/uchardet/ko/iso-2022-kr.txt diff --git a/src/uchardet/uchardet/test/ko/uhc.smi b/test/test_files/encoding/uchardet/ko/uhc.smi similarity index 100% rename from src/uchardet/uchardet/test/ko/uhc.smi rename to test/test_files/encoding/uchardet/ko/uhc.smi diff --git a/src/uchardet/uchardet/test/ko/utf-16.le b/test/test_files/encoding/uchardet/ko/utf-16.le similarity index 100% rename from src/uchardet/uchardet/test/ko/utf-16.le rename to test/test_files/encoding/uchardet/ko/utf-16.le diff --git a/src/uchardet/uchardet/test/ko/utf-32.be b/test/test_files/encoding/uchardet/ko/utf-32.be similarity index 100% rename from src/uchardet/uchardet/test/ko/utf-32.be rename to test/test_files/encoding/uchardet/ko/utf-32.be diff --git a/src/uchardet/uchardet/test/ko/utf-8.txt b/test/test_files/encoding/uchardet/ko/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/ko/utf-8.txt rename to test/test_files/encoding/uchardet/ko/utf-8.txt diff --git a/src/uchardet/uchardet/test/lt/iso-8859-10.txt b/test/test_files/encoding/uchardet/lt/iso-8859-10.txt similarity index 100% rename from src/uchardet/uchardet/test/lt/iso-8859-10.txt rename to test/test_files/encoding/uchardet/lt/iso-8859-10.txt diff --git a/src/uchardet/uchardet/test/lt/iso-8859-13.txt b/test/test_files/encoding/uchardet/lt/iso-8859-13.txt similarity index 100% rename from src/uchardet/uchardet/test/lt/iso-8859-13.txt rename to test/test_files/encoding/uchardet/lt/iso-8859-13.txt diff --git a/src/uchardet/uchardet/test/lt/iso-8859-4.txt b/test/test_files/encoding/uchardet/lt/iso-8859-4.txt similarity index 100% rename from src/uchardet/uchardet/test/lt/iso-8859-4.txt rename to test/test_files/encoding/uchardet/lt/iso-8859-4.txt diff --git a/src/uchardet/uchardet/test/lt/utf-8.txt b/test/test_files/encoding/uchardet/lt/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/lt/utf-8.txt rename to test/test_files/encoding/uchardet/lt/utf-8.txt diff --git a/src/uchardet/uchardet/test/lv/iso-8859-10.txt b/test/test_files/encoding/uchardet/lv/iso-8859-10.txt similarity index 100% rename from src/uchardet/uchardet/test/lv/iso-8859-10.txt rename to test/test_files/encoding/uchardet/lv/iso-8859-10.txt diff --git a/src/uchardet/uchardet/test/lv/iso-8859-13.txt b/test/test_files/encoding/uchardet/lv/iso-8859-13.txt similarity index 100% rename from src/uchardet/uchardet/test/lv/iso-8859-13.txt rename to test/test_files/encoding/uchardet/lv/iso-8859-13.txt diff --git a/src/uchardet/uchardet/test/lv/iso-8859-4.txt b/test/test_files/encoding/uchardet/lv/iso-8859-4.txt similarity index 100% rename from src/uchardet/uchardet/test/lv/iso-8859-4.txt rename to test/test_files/encoding/uchardet/lv/iso-8859-4.txt diff --git a/src/uchardet/uchardet/test/lv/utf-8.txt b/test/test_files/encoding/uchardet/lv/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/lv/utf-8.txt rename to test/test_files/encoding/uchardet/lv/utf-8.txt diff --git a/src/uchardet/uchardet/test/mt/iso-8859-3.txt b/test/test_files/encoding/uchardet/mt/iso-8859-3.txt similarity index 100% rename from src/uchardet/uchardet/test/mt/iso-8859-3.txt rename to test/test_files/encoding/uchardet/mt/iso-8859-3.txt diff --git a/src/uchardet/uchardet/test/mt/utf-8.txt b/test/test_files/encoding/uchardet/mt/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/mt/utf-8.txt rename to test/test_files/encoding/uchardet/mt/utf-8.txt diff --git a/src/uchardet/uchardet/test/no/ibm865.txt b/test/test_files/encoding/uchardet/no/ibm865.txt similarity index 100% rename from src/uchardet/uchardet/test/no/ibm865.txt rename to test/test_files/encoding/uchardet/no/ibm865.txt diff --git a/src/uchardet/uchardet/test/no/iso-8859-1.txt b/test/test_files/encoding/uchardet/no/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/no/iso-8859-1.txt rename to test/test_files/encoding/uchardet/no/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/no/iso-8859-15.txt b/test/test_files/encoding/uchardet/no/iso-8859-15.txt similarity index 100% rename from src/uchardet/uchardet/test/no/iso-8859-15.txt rename to test/test_files/encoding/uchardet/no/iso-8859-15.txt diff --git a/src/uchardet/uchardet/test/no/utf-8.txt b/test/test_files/encoding/uchardet/no/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/no/utf-8.txt rename to test/test_files/encoding/uchardet/no/utf-8.txt diff --git a/src/uchardet/uchardet/test/no/windows-1252.txt b/test/test_files/encoding/uchardet/no/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/no/windows-1252.txt rename to test/test_files/encoding/uchardet/no/windows-1252.txt diff --git a/src/uchardet/uchardet/test/pl/ibm852.txt b/test/test_files/encoding/uchardet/pl/ibm852.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/ibm852.txt rename to test/test_files/encoding/uchardet/pl/ibm852.txt diff --git a/src/uchardet/uchardet/test/pl/iso-8859-13.txt b/test/test_files/encoding/uchardet/pl/iso-8859-13.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/iso-8859-13.txt rename to test/test_files/encoding/uchardet/pl/iso-8859-13.txt diff --git a/src/uchardet/uchardet/test/pl/iso-8859-16.txt b/test/test_files/encoding/uchardet/pl/iso-8859-16.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/iso-8859-16.txt rename to test/test_files/encoding/uchardet/pl/iso-8859-16.txt diff --git a/src/uchardet/uchardet/test/pl/iso-8859-2.txt b/test/test_files/encoding/uchardet/pl/iso-8859-2.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/iso-8859-2.txt rename to test/test_files/encoding/uchardet/pl/iso-8859-2.txt diff --git a/src/uchardet/uchardet/test/pl/mac-centraleurope.txt b/test/test_files/encoding/uchardet/pl/mac-centraleurope.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/mac-centraleurope.txt rename to test/test_files/encoding/uchardet/pl/mac-centraleurope.txt diff --git a/src/uchardet/uchardet/test/pl/utf-8.txt b/test/test_files/encoding/uchardet/pl/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/utf-8.txt rename to test/test_files/encoding/uchardet/pl/utf-8.txt diff --git a/src/uchardet/uchardet/test/pl/windows-1250.txt b/test/test_files/encoding/uchardet/pl/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/pl/windows-1250.txt rename to test/test_files/encoding/uchardet/pl/windows-1250.txt diff --git a/src/uchardet/uchardet/test/pt/iso-8859-1.txt b/test/test_files/encoding/uchardet/pt/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/pt/iso-8859-1.txt rename to test/test_files/encoding/uchardet/pt/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/pt/utf-8.txt b/test/test_files/encoding/uchardet/pt/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/pt/utf-8.txt rename to test/test_files/encoding/uchardet/pt/utf-8.txt diff --git a/src/uchardet/uchardet/test/ro/ibm852.txt b/test/test_files/encoding/uchardet/ro/ibm852.txt similarity index 100% rename from src/uchardet/uchardet/test/ro/ibm852.txt rename to test/test_files/encoding/uchardet/ro/ibm852.txt diff --git a/src/uchardet/uchardet/test/ro/iso-8859-16.txt b/test/test_files/encoding/uchardet/ro/iso-8859-16.txt similarity index 100% rename from src/uchardet/uchardet/test/ro/iso-8859-16.txt rename to test/test_files/encoding/uchardet/ro/iso-8859-16.txt diff --git a/src/uchardet/uchardet/test/ro/utf-8.txt b/test/test_files/encoding/uchardet/ro/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/ro/utf-8.txt rename to test/test_files/encoding/uchardet/ro/utf-8.txt diff --git a/src/uchardet/uchardet/test/ro/windows-1250.txt b/test/test_files/encoding/uchardet/ro/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/ro/windows-1250.txt rename to test/test_files/encoding/uchardet/ro/windows-1250.txt diff --git a/src/uchardet/uchardet/test/ru/ibm855.txt b/test/test_files/encoding/uchardet/ru/ibm855.txt similarity index 100% rename from src/uchardet/uchardet/test/ru/ibm855.txt rename to test/test_files/encoding/uchardet/ru/ibm855.txt diff --git a/src/uchardet/uchardet/test/ru/ibm866.txt b/test/test_files/encoding/uchardet/ru/ibm866.txt similarity index 100% rename from src/uchardet/uchardet/test/ru/ibm866.txt rename to test/test_files/encoding/uchardet/ru/ibm866.txt diff --git a/src/uchardet/uchardet/test/ru/iso-8859-5.txt b/test/test_files/encoding/uchardet/ru/iso-8859-5.txt similarity index 100% rename from src/uchardet/uchardet/test/ru/iso-8859-5.txt rename to test/test_files/encoding/uchardet/ru/iso-8859-5.txt diff --git a/src/uchardet/uchardet/test/ru/koi8-r.txt b/test/test_files/encoding/uchardet/ru/koi8-r.txt similarity index 100% rename from src/uchardet/uchardet/test/ru/koi8-r.txt rename to test/test_files/encoding/uchardet/ru/koi8-r.txt diff --git a/src/uchardet/uchardet/test/ru/mac-cyrillic.txt b/test/test_files/encoding/uchardet/ru/mac-cyrillic.txt similarity index 100% rename from src/uchardet/uchardet/test/ru/mac-cyrillic.txt rename to test/test_files/encoding/uchardet/ru/mac-cyrillic.txt diff --git a/src/uchardet/uchardet/test/ru/windows-1251.txt b/test/test_files/encoding/uchardet/ru/windows-1251.txt similarity index 100% rename from src/uchardet/uchardet/test/ru/windows-1251.txt rename to test/test_files/encoding/uchardet/ru/windows-1251.txt diff --git a/src/uchardet/uchardet/test/sk/ibm852.txt b/test/test_files/encoding/uchardet/sk/ibm852.txt similarity index 100% rename from src/uchardet/uchardet/test/sk/ibm852.txt rename to test/test_files/encoding/uchardet/sk/ibm852.txt diff --git a/src/uchardet/uchardet/test/sk/iso-8859-2.txt b/test/test_files/encoding/uchardet/sk/iso-8859-2.txt similarity index 100% rename from src/uchardet/uchardet/test/sk/iso-8859-2.txt rename to test/test_files/encoding/uchardet/sk/iso-8859-2.txt diff --git a/src/uchardet/uchardet/test/sk/mac-centraleurope.txt b/test/test_files/encoding/uchardet/sk/mac-centraleurope.txt similarity index 100% rename from src/uchardet/uchardet/test/sk/mac-centraleurope.txt rename to test/test_files/encoding/uchardet/sk/mac-centraleurope.txt diff --git a/src/uchardet/uchardet/test/sk/utf-8.txt b/test/test_files/encoding/uchardet/sk/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/sk/utf-8.txt rename to test/test_files/encoding/uchardet/sk/utf-8.txt diff --git a/src/uchardet/uchardet/test/sk/windows-1250.txt b/test/test_files/encoding/uchardet/sk/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/sk/windows-1250.txt rename to test/test_files/encoding/uchardet/sk/windows-1250.txt diff --git a/src/uchardet/uchardet/test/sl/ibm852.txt b/test/test_files/encoding/uchardet/sl/ibm852.txt similarity index 100% rename from src/uchardet/uchardet/test/sl/ibm852.txt rename to test/test_files/encoding/uchardet/sl/ibm852.txt diff --git a/src/uchardet/uchardet/test/sl/iso-8859-16.txt b/test/test_files/encoding/uchardet/sl/iso-8859-16.txt similarity index 100% rename from src/uchardet/uchardet/test/sl/iso-8859-16.txt rename to test/test_files/encoding/uchardet/sl/iso-8859-16.txt diff --git a/src/uchardet/uchardet/test/sl/iso-8859-2.txt b/test/test_files/encoding/uchardet/sl/iso-8859-2.txt similarity index 100% rename from src/uchardet/uchardet/test/sl/iso-8859-2.txt rename to test/test_files/encoding/uchardet/sl/iso-8859-2.txt diff --git a/src/uchardet/uchardet/test/sl/mac-centraleurope.txt b/test/test_files/encoding/uchardet/sl/mac-centraleurope.txt similarity index 100% rename from src/uchardet/uchardet/test/sl/mac-centraleurope.txt rename to test/test_files/encoding/uchardet/sl/mac-centraleurope.txt diff --git a/src/uchardet/uchardet/test/sl/utf-8.txt b/test/test_files/encoding/uchardet/sl/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/sl/utf-8.txt rename to test/test_files/encoding/uchardet/sl/utf-8.txt diff --git a/src/uchardet/uchardet/test/sl/windows-1250.txt b/test/test_files/encoding/uchardet/sl/windows-1250.txt similarity index 100% rename from src/uchardet/uchardet/test/sl/windows-1250.txt rename to test/test_files/encoding/uchardet/sl/windows-1250.txt diff --git a/src/uchardet/uchardet/test/sv/iso-8859-1.txt b/test/test_files/encoding/uchardet/sv/iso-8859-1.txt similarity index 100% rename from src/uchardet/uchardet/test/sv/iso-8859-1.txt rename to test/test_files/encoding/uchardet/sv/iso-8859-1.txt diff --git a/src/uchardet/uchardet/test/sv/utf-8.txt b/test/test_files/encoding/uchardet/sv/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/sv/utf-8.txt rename to test/test_files/encoding/uchardet/sv/utf-8.txt diff --git a/src/uchardet/uchardet/test/sv/windows-1252.txt b/test/test_files/encoding/uchardet/sv/windows-1252.txt similarity index 100% rename from src/uchardet/uchardet/test/sv/windows-1252.txt rename to test/test_files/encoding/uchardet/sv/windows-1252.txt diff --git a/src/uchardet/uchardet/test/th/iso-8859-11.txt b/test/test_files/encoding/uchardet/th/iso-8859-11.txt similarity index 100% rename from src/uchardet/uchardet/test/th/iso-8859-11.txt rename to test/test_files/encoding/uchardet/th/iso-8859-11.txt diff --git a/src/uchardet/uchardet/test/th/tis-620.txt b/test/test_files/encoding/uchardet/th/tis-620.txt similarity index 100% rename from src/uchardet/uchardet/test/th/tis-620.txt rename to test/test_files/encoding/uchardet/th/tis-620.txt diff --git a/src/uchardet/uchardet/test/th/utf-8.txt b/test/test_files/encoding/uchardet/th/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/th/utf-8.txt rename to test/test_files/encoding/uchardet/th/utf-8.txt diff --git a/src/uchardet/uchardet/test/tr/iso-8859-3.txt b/test/test_files/encoding/uchardet/tr/iso-8859-3.txt similarity index 100% rename from src/uchardet/uchardet/test/tr/iso-8859-3.txt rename to test/test_files/encoding/uchardet/tr/iso-8859-3.txt diff --git a/src/uchardet/uchardet/test/tr/iso-8859-9.txt b/test/test_files/encoding/uchardet/tr/iso-8859-9.txt similarity index 100% rename from src/uchardet/uchardet/test/tr/iso-8859-9.txt rename to test/test_files/encoding/uchardet/tr/iso-8859-9.txt diff --git a/src/uchardet/uchardet/test/uchardet-tests.c b/test/test_files/encoding/uchardet/uchardet-tests.c similarity index 100% rename from src/uchardet/uchardet/test/uchardet-tests.c rename to test/test_files/encoding/uchardet/uchardet-tests.c diff --git a/src/uchardet/uchardet/test/vi/utf-8.txt b/test/test_files/encoding/uchardet/vi/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/vi/utf-8.txt rename to test/test_files/encoding/uchardet/vi/utf-8.txt diff --git a/src/uchardet/uchardet/test/vi/viscii.txt b/test/test_files/encoding/uchardet/vi/viscii.txt similarity index 100% rename from src/uchardet/uchardet/test/vi/viscii.txt rename to test/test_files/encoding/uchardet/vi/viscii.txt diff --git a/src/uchardet/uchardet/test/vi/windows-1258.txt b/test/test_files/encoding/uchardet/vi/windows-1258.txt similarity index 100% rename from src/uchardet/uchardet/test/vi/windows-1258.txt rename to test/test_files/encoding/uchardet/vi/windows-1258.txt diff --git a/src/uchardet/uchardet/test/zh/big5.txt b/test/test_files/encoding/uchardet/zh/big5.txt similarity index 100% rename from src/uchardet/uchardet/test/zh/big5.txt rename to test/test_files/encoding/uchardet/zh/big5.txt diff --git a/src/uchardet/uchardet/test/zh/euc-tw.txt b/test/test_files/encoding/uchardet/zh/euc-tw.txt similarity index 100% rename from src/uchardet/uchardet/test/zh/euc-tw.txt rename to test/test_files/encoding/uchardet/zh/euc-tw.txt diff --git a/src/uchardet/uchardet/test/zh/gb18030.txt b/test/test_files/encoding/uchardet/zh/gb18030.txt similarity index 100% rename from src/uchardet/uchardet/test/zh/gb18030.txt rename to test/test_files/encoding/uchardet/zh/gb18030.txt diff --git a/src/uchardet/uchardet/test/zh/utf-8.txt b/test/test_files/encoding/uchardet/zh/utf-8.txt similarity index 100% rename from src/uchardet/uchardet/test/zh/utf-8.txt rename to test/test_files/encoding/uchardet/zh/utf-8.txt