Delphi World - Автоматическое определения кодировки текста
Delphi World - это проект, являющийся сборником статей и малодокументированных возможностей  по программированию в среде Delphi. Здесь вы найдёте работы по следующим категориям: delphi, delfi, borland, bds, дельфи, делфи, дэльфи, дэлфи, programming, example, программирование, исходные коды, code, исходники, source, sources, сорцы, сорсы, soft, programs, программы, and, how, delphiworld, базы данных, графика, игры, интернет, сети, компоненты, классы, мультимедиа, ос, железо, программа, интерфейс, рабочий стол, синтаксис, технологии, файловая система...
Автоматическое определения кодировки текста

Автор: Stas Malinovski

<АБЫРВАЛГ!>, сказал линyкс после русификации.

Методом таблицы модельных распределений:


type
  TCodePage = (cpWin1251, cp866, cpKOI8R);
  PMap = ^TMap;
  TMap = array[#$80..#$FF] of Char;

function GetMap(CP: TCodePage): PMap;
{ должна возвращать указатель на таблицу перекодировки из CP в Windows1251
(nil для CP = cpWin1251) }
begin
  GetMap := nil;
end;

function DetermineRussian(Buf: PChar; Count: Integer): TCodePage;
const
  ModelBigrams: array[0..33, 0..33] of Byte = (
    {АБВГДЕЖЗИЙКЛМHОПРСТУФХЦЧШЩЪЫЬЭЮЯ_?}
    {А}(0, 20, 44, 12, 22, 23, 16, 60, 4, 9, 63, 93, 47, 110, 0, 16, 35, 61, 81,
      1, 5, 13, 24, 17, 12, 4, 0, 0, 0, 0, 14, 31, 205, 1),
    {Б}(19, 0, 0, 0, 4, 19, 0, 0, 8, 0, 2, 15, 1, 4, 41, 0, 15, 5, 0, 15, 0, 2,
      1, 0, 0, 6, 16, 37, 0, 0, 0, 4, 3, 0),
    {В}(97, 0, 1, 0, 2, 57, 0, 5, 40, 0, 4, 25, 2, 23, 78, 2, 8, 28, 4, 12, 0,
      1, 0, 0, 8, 1, 0, 40, 1, 0, 0, 5, 106, 3),
    {Г}(13, 0, 0, 0, 9, 5, 0, 0, 15, 0, 1, 17, 1, 2, 96, 0, 24, 0, 0, 7, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0),
    {Д}(63, 0, 9, 1, 2, 71, 1, 0, 35, 0, 3, 16, 2, 22, 50, 2, 19, 9, 2, 25, 0,
      2, 1, 0, 1, 0, 1, 9, 4, 0, 1, 5, 17, 4),
    {Е}(4, 14, 15, 34, 56, 22, 13, 14, 2, 34, 39, 77, 73, 150, 6, 9, 101, 64,
      81, 1, 0, 15, 5, 12, 10, 6, 0, 0, 0, 0, 3, 4, 235, 1),
    {Ж}(13, 0, 0, 0, 12, 47, 0, 0, 16, 0, 1, 0, 0, 23, 0, 0, 0, 0, 0, 3, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2),
    {З}(76, 2, 11, 3, 11, 4, 1, 0, 7, 0, 2, 4, 11, 24, 17, 0, 6, 1, 0, 8, 0, 0,
      0, 0, 0, 0, 0, 16, 6, 0, 1, 4, 17, 0),
    {И}(7, 9, 32, 5, 18, 60, 4, 42, 31, 27, 28, 46, 55, 49, 12, 7, 26, 60, 53,
      0, 5, 25, 14, 28, 4, 1, 0, 0, 0, 0, 9, 56, 255, 0),
    {Й}(0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 10, 3, 0, 0, 0, 0, 1,
      1, 0, 0, 0, 0, 0, 0, 0, 122, 0),
    {К}(92, 0, 3, 0, 0, 7, 2, 1, 39, 0, 0, 27, 0, 14, 110, 0, 18, 5, 35, 18, 0,
      0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0),
    {Л}(85, 1, 0, 2, 1, 70, 6, 0, 85, 0, 5, 3, 0, 9, 67, 1, 0, 9, 0, 15, 0, 0,
      0, 2, 0, 0, 0, 9, 66, 0, 15, 43, 57, 4),
    {М}(44, 0, 0, 0, 0, 65, 0, 0, 47, 0, 1, 1, 10, 15, 57, 7, 0, 2, 0, 24, 0, 0,
      0, 0, 0, 0, 0, 28, 0, 0, 0, 8, 109, 3),
    {}(139, 0, 0, 1, 11, 108, 0, 4, 152, 0, 7, 0, 1, 69, 161, 0, 0, 8, 25, 24,
      5, 1, 5, 2, 0, 1, 0, 83, 10, 0, 1, 29, 38, 5),
    {О}(0, 72, 139, 76, 74, 32, 32, 19, 12, 52, 21, 93, 68, 72, 7, 34, 93, 102,
      98, 1, 2, 6, 6, 19, 15, 2, 0, 0, 0, 1, 4, 9, 252, 2),
    {П}(17, 0, 0, 0, 0, 43, 0, 0, 14, 0, 1, 9, 0, 1, 125, 3, 120, 1, 2, 8, 0, 0,
      0, 0, 0, 0, 0, 3, 6, 0, 0, 3, 2, 2),
    {Р}(151, 1, 6, 4, 3, 103, 7, 0, 76, 0, 4, 0, 11, 10, 117, 1, 0, 5, 9, 39, 2,
      5, 0, 1, 3, 0, 0, 24, 7, 0, 1, 10, 22, 5),
    {С}(24, 1, 21, 0, 3, 39, 0, 0, 33, 0, 56, 41, 11, 15, 58, 30, 5, 30, 183,
      16, 0, 4, 1, 4, 1, 0, 0, 8, 25, 0, 1, 50, 41, 2),
    {Т}(83, 0, 43, 0, 3, 87, 0, 0, 71, 0, 9, 3, 2, 26, 180, 0, 55, 33, 1, 23, 1,
      0, 1, 4, 0, 0, 0, 20, 78, 0, 0, 5, 82, 4),
    {У}(3, 6, 7, 14, 19, 8, 13, 6, 0, 1, 13, 15, 10, 7, 0, 12, 17, 16, 19, 0, 1,
      3, 0, 12, 5, 8, 0, 0, 0, 0, 22, 1, 65, 0),
    {Ф}(4, 0, 0, 0, 0, 4, 0, 0, 11, 0, 0, 1, 0, 0, 9, 0, 3, 0, 0, 4, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 2, 0),
    {Х}(9, 0, 2, 0, 0, 2, 0, 0, 5, 0, 0, 1, 0, 5, 26, 0, 4, 1, 0, 1, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 76, 0),
    {Ц}(5, 0, 0, 0, 0, 16, 0, 0, 48, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0,
      0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0),
    {Ч}(30, 0, 0, 0, 0, 52, 0, 0, 23, 0, 3, 1, 0, 14, 1, 0, 0, 0, 36, 5, 0, 0,
      0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2),
    {Ш}(13, 0, 0, 0, 0, 28, 0, 0, 17, 0, 4, 4, 0, 4, 3, 0, 0, 0, 1, 3, 0, 0, 0,
      0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 1),
    {Щ}(6, 0, 0, 0, 0, 23, 0, 0, 16, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1),
    {Ъ}(0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 1, 1, 0, 0),
    {Ы}(0, 5, 14, 1, 3, 28, 0, 2, 0, 22, 6, 19, 21, 2, 0, 5, 4, 7, 10, 0, 0, 37,
      0, 3, 4, 0, 0, 0, 0, 0, 0, 1, 84, 0),
    {Ь}(0, 1, 0, 0, 0, 9, 0, 10, 1, 0, 13, 0, 2, 26, 0, 0, 0, 10, 3, 0, 0, 0, 1,
      0, 6, 0, 0, 0, 0, 0, 6, 4, 117, 0),
    {Э}(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 31, 0, 1, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    {Ю}(0, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 15, 0, 0, 0, 1, 4,
      1, 15, 0, 0, 0, 0, 0, 0, 38, 0),
    {Я}(0, 0, 9, 2, 7, 10, 3, 19, 0, 0, 1, 6, 7, 8, 0, 0, 2, 6, 19, 0, 0, 3, 5,
      1, 0, 3, 0, 0, 0, 0, 5, 2, 177, 0),
    {_}(42, 80, 193, 43, 109, 41, 18, 53, 159, 0, 144, 27, 83, 176, 187, 229,
      70, 231, 99, 47, 15, 13, 6, 58, 7, 0, 0, 0, 0, 38, 0, 22, 0, 2),
    {?}(0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 4, 4, 8, 0, 0, 5, 3, 4, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
  { " рейтинг"  буквы ? условно принимается равным 1/20 от " рейтинга"  буквы E,
  если сочетание с участием ? корректно, иначе - 0 }
type
  TVariation = array[0..33, 0..33] of Integer;
var
  I, J, iC, iPredC, Max: Integer;
  C: Char;
  CP: TCodePage;
  D, MinD, Factor: Double;
  AMap: PMap;
  PV: ^TVariation;
  Vars: array[TCodePage] of TVariation;
begin
  DetermineRussian := cpWin1251; { по yмолчанию }
  { вычисление распределений биграмм }
  FillChar(Vars, SizeOf(Vars), 0);
  for CP := Low(Vars) to High(Vars) do
  begin
    AMap := GetMap(CP);
    PV := @Vars[CP];
    iPredC := 32;
    for I := 0 to Count - 1 do
    begin
      C := Buf[I];
      iC := 32;
      if C > = #128 then
      begin
        if AMap < > nil then
          C := AMap^[C];
        if not (C in ['?', '?']) then
        begin
          C := Chr(Ord(C) and not 32); { 'a'..'я' ->  'А'..'Я' }
          if C in ['А'..'Я'] then
            iC := Ord(C) - Ord('А');
        end
        else
          iC := 33;
      end;
      Inc(PV^[iPredC, iC]);
      iPredC := iC;
    end;
  end;
  { вычисление метрики и определение наиболее правдоподобной кодировки }
  MinD := 0;
  for CP := Low(Vars) to High(Vars) do
  begin
    PV := @Vars[CP];
    PV^[32, 32] := 0;
    Max := 1;
    for I := 0 to 33 do
      for J := 0 to 33 do
        if PV^[I, J] > Max then
          Max := PV^[I, J];
    Factor := 255 / Max; { ноpмализация }
    D := 0;
    for I := 0 to 33 do
      for J := 0 to 33 do
        D := D + Abs(PV^[I, J] * Factor - ModelBigrams[I, J]);
    if (MinD = 0) or (D < MinD) then
    begin
      MinD := D;
      DetermineRussian := CP;
    end;
  end;
end;

begin
  { тест: слово 'Пример' в разных кодировках (веpоятность ошибок на таких
  коpотких текстах высока - в данном слyчае пpосто повезло!) }
  writeln(DetermineRussian(#$CF#$F0#$E8#$EC#$E5#$F0, 6) = cpWin1251);
  writeln(DetermineRussian(#$8F#$E0#$A8#$AC#$A5#$E0, 6) = cp866);
  writeln(DetermineRussian(#$F0#$D2#$C9#$CD#$C5#$D2, 6) = cpKOI8R);
  readln;
end.

Проект Delphi World © Выпуск 2002 - 2017
Автор проекта: Эксклюзивные курсы программирования