From 8df53d85e472a1dad75f20928399b0f67eb47df4 Mon Sep 17 00:00:00 2001 From: Pedro Maciel Xavier Date: Tue, 4 Apr 2023 09:19:16 -0300 Subject: [PATCH] Fix encoding table error --- Project.toml | 2 +- src/tables.jl | 299 ++++++++++++++++------------------------------- test/runtests.jl | 2 +- 3 files changed, 105 insertions(+), 198 deletions(-) diff --git a/Project.toml b/Project.toml index e0e439f..82a97dc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Encodings" uuid = "8275c4fe-57c3-4fbf-b39c-271e6148849a" authors = ["pedromxavier "] -version = "0.1.0" +version = "0.1.1" [compat] julia = "1.6" diff --git a/src/tables.jl b/src/tables.jl index 440af14..66176b7 100644 --- a/src/tables.jl +++ b/src/tables.jl @@ -1,199 +1,106 @@ # ISO_LATIN_1 tables from: -# https://www.ime.usp.br/~pf/algoritmos/apend/iso-latin-1.html -const ENCODE_ISO_LATIN_1 = Dict{Char, UInt8}( - ' ' => 0xA0, # non-breaking space - '¡' => 0xA1, # exclamação invertida - '¢' => 0xA2, # - '£' => 0xA3, # libra - '¤' => 0xA4, # moeda genérica - '¥' => 0xA5, # yen - '¦' => 0xA6, # - '§' => 0xA7, # parágrafo - '¨' => 0xA8, # trema - '©' => 0xA9, # copyright - 'ª' => 0xAA, # ordinal feminino - '«' => 0xAB, # abre-aspas europeu - '¬' => 0xAC, # negação lógica - '­' => 0xAD, # soft hyphen - '®' => 0xAE, # registrado - '¯' => 0xAF, # overscore - '°' => 0xB0, # grau - '±' => 0xB1, # mais ou menos - '²' => 0xB2, # ao quadrado - '³' => 0xB3, # ao cubo - '´' => 0xB4, # acento agudo - 'µ' => 0xB5, # micro, mü - '¶' => 0xB6, # parágrafo - '·' => 0xB7, # middle dot - '¸' => 0xB8, # cedilha - '¹' => 0xB9, # elevado a 1 - 'º' => 0xBA, # ordinal masculino - '»' => 0xBB, # fecha-aspas europeu - '¼' => 0xBC, # - '½' => 0xBD, # - '¾' => 0xBE, # - '¿' => 0xBF, # interrogação invertida - 'À' => 0xC0, # - 'Á' => 0xC1, # - 'Â' => 0xC2, # - 'Ã' => 0xC3, # - 'Ä' => 0xC4, # - 'Å' => 0xC5, # - 'Æ' => 0xC6, # ligadura AE - 'Ç' => 0xC7, # - 'È' => 0xC8, # - 'É' => 0xC9, # - 'Ê' => 0xCA, # - 'Ë' => 0xCB, # - 'Ì' 
=> 0xCC, # - 'Í' => 0xCD, # - 'Î' => 0xCE, # - 'Ï' => 0xCF, # - 'Ð' => 0xD0, # Eth (islândico) - 'Ñ' => 0xD1, # - 'Ò' => 0xD2, # - 'Ó' => 0xD3, # - 'Ô' => 0xD4, # - 'Õ' => 0xD5, # - 'Ö' => 0xD6, # - '×' => 0xD7, # multiplicação - 'Ø' => 0xD8, # - 'Ù' => 0xD9, # - 'Ú' => 0xDA, # - 'Û' => 0xDB, # - 'Ü' => 0xDC, # - 'Ý' => 0xDD, # - 'Þ' => 0xDE, # Thorn (islândico) - 'ß' => 0xDF, # ligadura sz (alemão) - 'à' => 0xE0, # - 'á' => 0xE1, # - 'â' => 0xE2, # - 'ã' => 0xE3, # - 'ä' => 0xE4, # - 'å' => 0xE5, # - 'æ' => 0xE6, # ligadura ae - 'ç' => 0xE7, # - 'è' => 0xE8, # - 'é' => 0xE0, # - 'ê' => 0xEA, # - 'ë' => 0xEB, # - 'ì' => 0xEC, # - 'í' => 0xED, # - 'î' => 0xEE, # - 'ï' => 0xEF, # - 'ð' => 0xF0, # eth (islândico) - 'ñ' => 0xF1, # - 'ò' => 0xF2, # - 'ó' => 0xF3, # - 'ô' => 0xF4, # - 'õ' => 0xF5, # - 'ö' => 0xF6, # - '÷' => 0xF7, # divisão - 'ø' => 0xF8, # - 'ù' => 0xF9, # - 'ú' => 0xFA, # - 'û' => 0xFB, # - 'ü' => 0xFC, # - 'ý' => 0xFD, # - 'þ' => 0xFE, # thorn (islândico) - 'ÿ' => 0xFF, # -) +# https://cs.stanford.edu/people/miles/iso8859.html const DECODE_ISO_LATIN_1 = Dict{UInt8, Char}( - 0xA0 => ' ', # non-breaking space - 0xA1 => '¡', # exclamação invertida - 0xA2 => '¢', # - 0xA3 => '£', # libra - 0xA4 => '¤', # moeda genérica - 0xA5 => '¥', # yen - 0xA6 => '¦', # - 0xA7 => '§', # parágrafo - 0xA8 => '¨', # trema - 0xA9 => '©', # copyright - 0xAA => 'ª', # ordinal feminino - 0xAB => '«', # abre-aspas europeu - 0xAC => '¬', # negação lógica - 0xAD => '­' , # soft hyphen - 0xAE => '®', # registrado - 0xAF => '¯', # overscore - 0xB0 => '°', # grau - 0xB1 => '±', # mais ou menos - 0xB2 => '²', # ao quadrado - 0xB3 => '³', # ao cubo - 0xB4 => '´', # acento agudo - 0xB5 => 'µ', # micro, mü - 0xB6 => '¶', # parágrafo - 0xB7 => '·', # middle dot - 0xB8 => '¸', # cedilha - 0xB9 => '¹', # elevado a 1 - 0xBA => 'º', # ordinal masculino - 0xBB => '»', # fecha-aspas europeu - 0xBC => '¼', # - 0xBD => '½', # - 0xBE => '¾', # - 0xBF => '¿', # interrogação invertida - 0xC0 => 
'À', # - 0xC1 => 'Á', # - 0xC2 => 'Â', # - 0xC3 => 'Ã', # - 0xC4 => 'Ä', # - 0xC5 => 'Å', # - 0xC6 => 'Æ', # ligadura AE - 0xC7 => 'Ç', # - 0xC8 => 'È', # - 0xC9 => 'É', # - 0xCA => 'Ê', # - 0xCB => 'Ë', # - 0xCC => 'Ì', # - 0xCD => 'Í', # - 0xCE => 'Î', # - 0xCF => 'Ï', # - 0xD0 => 'Ð', # Eth (islândico) - 0xD1 => 'Ñ', # - 0xD2 => 'Ò', # - 0xD3 => 'Ó', # - 0xD4 => 'Ô', # - 0xD5 => 'Õ', # - 0xD6 => 'Ö', # - 0xD7 => '×', # multiplicação - 0xD8 => 'Ø', # - 0xD9 => 'Ù', # - 0xDA => 'Ú', # - 0xDB => 'Û', # - 0xDC => 'Ü', # - 0xDD => 'Ý', # - 0xDE => 'Þ', # Thorn (islândico) - 0xDF => 'ß', # ligadura sz (alemão) - 0xE0 => 'à', # - 0xE1 => 'á', # - 0xE2 => 'â', # - 0xE3 => 'ã', # - 0xE4 => 'ä', # - 0xE5 => 'å', # - 0xE6 => 'æ', # ligadura ae - 0xE7 => 'ç', # - 0xE8 => 'è', # - 0xE0 => 'é', # - 0xEA => 'ê', # - 0xEB => 'ë', # - 0xEC => 'ì', # - 0xED => 'í', # - 0xEE => 'î', # - 0xEF => 'ï', # - 0xF0 => 'ð', # eth (islândico) - 0xF1 => 'ñ', # - 0xF2 => 'ò', # - 0xF3 => 'ó', # - 0xF4 => 'ô', # - 0xF5 => 'õ', # - 0xF6 => 'ö', # - 0xF7 => '÷', # divisão - 0xF8 => 'ø', # - 0xF9 => 'ù', # - 0xFA => 'ú', # - 0xFB => 'û', # - 0xFC => 'ü', # - 0xFD => 'ý', # - 0xFE => 'þ', # thorn (islândico) - 0xFF => 'ÿ', # -) \ No newline at end of file + # Special Characters + 0xA0 => '\u00a0', # non-breaking space (escaped: the literal is invisible) + 0xA1 => '¡', # inverted exclamation mark + 0xA2 => '¢', # cent sign + 0xA3 => '£', # pound sterling sign + 0xA4 => '¤', # general currency sign + 0xA5 => '¥', # yen sign + 0xA6 => '¦', # broken vertical bar + 0xA7 => '§', # section sign + 0xA8 => '¨', # spacing dieresis or umlaut + 0xA9 => '©', # copyright sign + 0xAA => 'ª', # feminine ordinal sign + 0xAB => '«', # left double angle quote or guillemet + 0xAC => '¬', # logical not sign + 0xAD => '\u00ad', # soft hyphen (escaped: the literal is invisible) + 0xAE => '®', # registered trademark sign + 0xAF => '¯', # spacing macron (long accent) + 0xB0 => '°', # degree sign + 0xB1 => '±', # plus-or-minus sign + 0xB2 => '²', # superscript 2 + 0xB3 => '³', # superscript 3 + 
0xB4 => '´', # spacing acute accent + 0xB5 => 'µ', # micro sign, mu + 0xB6 => '¶', # paragraph sign, pilcrow sign + 0xB7 => '·', # middle dot, centered dot + 0xB8 => '¸', # spacing cedilla + 0xB9 => '¹', # superscript 1 + 0xBA => 'º', # masculine ordinal indicator + 0xBB => '»', # right double angle quote or guillemet + 0xBC => '¼', # fraction 1/4 + 0xBD => '½', # fraction 1/2 + 0xBE => '¾', # fraction 3/4 + 0xBF => '¿', # inverted question mark + # Upper Case Latin-1 Letters + 0xC0 => 'À', # capital A grave + 0xC1 => 'Á', # capital A acute + 0xC2 => 'Â', # capital A circumflex + 0xC3 => 'Ã', # capital A tilde + 0xC4 => 'Ä', # capital A dieresis or umlaut + 0xC5 => 'Å', # capital A ring + 0xC6 => 'Æ', # capital AE ligature + 0xC7 => 'Ç', # capital C cedilla + 0xC8 => 'È', # capital E grave + 0xC9 => 'É', # capital E acute + 0xCA => 'Ê', # capital E circumflex + 0xCB => 'Ë', # capital E dieresis or umlaut + 0xCC => 'Ì', # capital I grave + 0xCD => 'Í', # capital I acute + 0xCE => 'Î', # capital I circumflex + 0xCF => 'Ï', # capital I dieresis or umlaut + 0xD0 => 'Ð', # capital ETH + 0xD1 => 'Ñ', # capital N tilde + 0xD2 => 'Ò', # capital O grave + 0xD3 => 'Ó', # capital O acute + 0xD4 => 'Ô', # capital O circumflex + 0xD5 => 'Õ', # capital O tilde + 0xD6 => 'Ö', # capital O dieresis or umlaut + 0xD7 => '×', # multiplication sign + 0xD8 => 'Ø', # capital O slash + 0xD9 => 'Ù', # capital U grave + 0xDA => 'Ú', # capital U acute + 0xDB => 'Û', # capital U circumflex + 0xDC => 'Ü', # capital U dieresis or umlaut + 0xDD => 'Ý', # capital Y acute + 0xDE => 'Þ', # capital THORN + 0xDF => 'ß', # small sharp s, sz ligature + # Lower Case Latin-1 Letters + 0xE0 => 'à', # small a grave + 0xE1 => 'á', # small a acute + 0xE2 => 'â', # small a circumflex + 0xE3 => 'ã', # small a tilde + 0xE4 => 'ä', # small a dieresis or umlaut + 0xE5 => 'å', # small a ring + 0xE6 => 'æ', # small ae ligature + 0xE7 => 'ç', # small c cedilla + 0xE8 => 'è', # small e grave + 0xE9 => 'é', # small 
e acute + 0xEA => 'ê', # small e circumflex + 0xEB => 'ë', # small e dieresis or umlaut + 0xEC => 'ì', # small i grave + 0xED => 'í', # small i acute + 0xEE => 'î', # small i circumflex + 0xEF => 'ï', # small i dieresis or umlaut + 0xF0 => 'ð', # small eth + 0xF1 => 'ñ', # small n tilde + 0xF2 => 'ò', # small o grave + 0xF3 => 'ó', # small o acute + 0xF4 => 'ô', # small o circumflex + 0xF5 => 'õ', # small o tilde + 0xF6 => 'ö', # small o dieresis or umlaut + 0xF7 => '÷', # division sign + 0xF8 => 'ø', # small o slash + 0xF9 => 'ù', # small u grave + 0xFA => 'ú', # small u acute + 0xFB => 'û', # small u circumflex + 0xFC => 'ü', # small u dieresis or umlaut + 0xFD => 'ý', # small y acute + 0xFE => 'þ', # small thorn + 0xFF => 'ÿ', # small y dieresis or umlaut +) +# Encoding table: the exact inverse of DECODE_ISO_LATIN_1, derived rather than hand-written so the two tables cannot disagree. +const ENCODE_ISO_LATIN_1 = Dict{Char, UInt8}(char => code for (code, char) in DECODE_ISO_LATIN_1) diff --git a/test/runtests.jl b/test/runtests.jl index 60ec1fb..1bf0b9b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,7 @@ using Test using Encodings text = "Café com pão" -data = UInt8[0x43, 0x61, 0x66, 0xe0, 0x20, 0x63, 0x6f, 0x6d, 0x20, 0x70, 0xe3, 0x6f] +data = UInt8[0x43, 0x61, 0x66, 0xe9, 0x20, 0x63, 0x6f, 0x6d, 0x20, 0x70, 0xe3, 0x6f] @testset "Encoding :: ISO-LATIN-1" begin @test encode(text, Encodings.ISO_LATIN_1()) == data