Skip to content

Commit

Permalink
Fix encoding table error
Browse files Browse the repository at this point in the history
  • Loading branch information
pedromxavier committed Apr 4, 2023
1 parent f936daa commit 8df53d8
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 198 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Encodings"
uuid = "8275c4fe-57c3-4fbf-b39c-271e6148849a"
authors = ["pedromxavier <[email protected]>"]
version = "0.1.0"
version = "0.1.1"

[compat]
julia = "1.6"
299 changes: 103 additions & 196 deletions src/tables.jl
Original file line number Diff line number Diff line change
@@ -1,199 +1,106 @@
# ISO_LATIN_1 tables from:
# https://www.ime.usp.br/~pf/algoritmos/apend/iso-latin-1.html
# Encoding table for the upper half of ISO-8859-1 (Latin-1).
#
# Maps each character in the range U+00A0..U+00FF to its single-byte code.
# In Latin-1 the byte value is always equal to the Unicode code point of the
# character, so every entry must satisfy `UInt8(char) == code`.
#
# Invisible or easily-lost characters (non-breaking space, soft hyphen,
# pilcrow) are written as `\u` escapes so that editors and copy/paste cannot
# silently drop or replace them.
const ENCODE_ISO_LATIN_1 = Dict{Char, UInt8}(
    # Special characters
    '\u00a0' => 0xA0, # non-breaking space
    '¡' => 0xA1, # inverted exclamation mark
    '¢' => 0xA2, # cent sign
    '£' => 0xA3, # pound sterling sign
    '¤' => 0xA4, # generic currency sign
    '¥' => 0xA5, # yen sign
    '¦' => 0xA6, # broken vertical bar
    '§' => 0xA7, # section sign
    '¨' => 0xA8, # dieresis (umlaut)
    '©' => 0xA9, # copyright sign
    'ª' => 0xAA, # feminine ordinal indicator
    '«' => 0xAB, # left-pointing guillemet
    '¬' => 0xAC, # logical not sign
    '\u00ad' => 0xAD, # soft hyphen
    '®' => 0xAE, # registered trademark sign
    '¯' => 0xAF, # macron (overscore)
    '°' => 0xB0, # degree sign
    '±' => 0xB1, # plus-or-minus sign
    '²' => 0xB2, # superscript 2
    '³' => 0xB3, # superscript 3
    '´' => 0xB4, # acute accent
    'µ' => 0xB5, # micro sign
    '\u00b6' => 0xB6, # pilcrow (paragraph sign)
    '·' => 0xB7, # middle dot
    '¸' => 0xB8, # cedilla
    '¹' => 0xB9, # superscript 1
    'º' => 0xBA, # masculine ordinal indicator
    '»' => 0xBB, # right-pointing guillemet
    '¼' => 0xBC, # fraction 1/4
    '½' => 0xBD, # fraction 1/2
    '¾' => 0xBE, # fraction 3/4
    '¿' => 0xBF, # inverted question mark
    # Upper-case letters
    'À' => 0xC0, # capital A grave
    'Á' => 0xC1, # capital A acute
    'Â' => 0xC2, # capital A circumflex
    'Ã' => 0xC3, # capital A tilde
    'Ä' => 0xC4, # capital A dieresis
    'Å' => 0xC5, # capital A ring
    'Æ' => 0xC6, # capital AE ligature
    'Ç' => 0xC7, # capital C cedilla
    'È' => 0xC8, # capital E grave
    'É' => 0xC9, # capital E acute
    'Ê' => 0xCA, # capital E circumflex
    'Ë' => 0xCB, # capital E dieresis
    'Ì' => 0xCC, # capital I grave
    'Í' => 0xCD, # capital I acute
    'Î' => 0xCE, # capital I circumflex
    'Ï' => 0xCF, # capital I dieresis
    'Ð' => 0xD0, # capital Eth (Icelandic)
    'Ñ' => 0xD1, # capital N tilde
    'Ò' => 0xD2, # capital O grave
    'Ó' => 0xD3, # capital O acute
    'Ô' => 0xD4, # capital O circumflex
    'Õ' => 0xD5, # capital O tilde
    'Ö' => 0xD6, # capital O dieresis
    '×' => 0xD7, # multiplication sign
    'Ø' => 0xD8, # capital O slash
    'Ù' => 0xD9, # capital U grave
    'Ú' => 0xDA, # capital U acute
    'Û' => 0xDB, # capital U circumflex
    'Ü' => 0xDC, # capital U dieresis
    'Ý' => 0xDD, # capital Y acute
    'Þ' => 0xDE, # capital Thorn (Icelandic)
    'ß' => 0xDF, # sharp s, sz ligature (German)
    # Lower-case letters
    'à' => 0xE0, # small a grave
    'á' => 0xE1, # small a acute
    'â' => 0xE2, # small a circumflex
    'ã' => 0xE3, # small a tilde
    'ä' => 0xE4, # small a dieresis
    'å' => 0xE5, # small a ring
    'æ' => 0xE6, # small ae ligature
    'ç' => 0xE7, # small c cedilla
    'è' => 0xE8, # small e grave
    'é' => 0xE9, # small e acute (was wrongly 0xE0, colliding with 'à')
    'ê' => 0xEA, # small e circumflex
    'ë' => 0xEB, # small e dieresis
    'ì' => 0xEC, # small i grave
    'í' => 0xED, # small i acute
    'î' => 0xEE, # small i circumflex
    'ï' => 0xEF, # small i dieresis
    'ð' => 0xF0, # small eth (Icelandic)
    'ñ' => 0xF1, # small n tilde
    'ò' => 0xF2, # small o grave
    'ó' => 0xF3, # small o acute
    'ô' => 0xF4, # small o circumflex
    'õ' => 0xF5, # small o tilde
    'ö' => 0xF6, # small o dieresis
    '÷' => 0xF7, # division sign
    'ø' => 0xF8, # small o slash
    'ù' => 0xF9, # small u grave
    'ú' => 0xFA, # small u acute
    'û' => 0xFB, # small u circumflex
    'ü' => 0xFC, # small u dieresis
    'ý' => 0xFD, # small y acute
    'þ' => 0xFE, # small thorn (Icelandic)
    'ÿ' => 0xFF, # small y dieresis
)
# https://cs.stanford.edu/people/miles/iso8859.html

# Decoding table for the upper half of ISO-8859-1 (Latin-1).
#
# Maps each byte in 0xA0..0xFF to the character it represents. In Latin-1 the
# byte value is always equal to the Unicode code point of the character, so
# every entry must satisfy `Char(code) == char`. Each byte appears exactly
# once; a repeated key in a `Dict` literal silently overwrites the earlier
# entry, which would leave one byte unmapped.
#
# Invisible or easily-lost characters (non-breaking space, soft hyphen,
# pilcrow) are written as `\u` escapes so that editors and copy/paste cannot
# silently drop or replace them.
const DECODE_ISO_LATIN_1 = Dict{UInt8, Char}(
    # Special Characters
    0xA0 => '\u00a0', # non-breaking space
    0xA1 => '¡', # inverted exclamation mark
    0xA2 => '¢', # cent sign
    0xA3 => '£', # pound sterling sign
    0xA4 => '¤', # general currency sign
    0xA5 => '¥', # yen sign
    0xA6 => '¦', # broken vertical bar
    0xA7 => '§', # section sign
    0xA8 => '¨', # spacing dieresis or umlaut
    0xA9 => '©', # copyright sign
    0xAA => 'ª', # feminine ordinal sign
    0xAB => '«', # left double angle quote or guillemet
    0xAC => '¬', # logical not sign
    0xAD => '\u00ad', # soft hyphen
    0xAE => '®', # registered trademark sign
    0xAF => '¯', # spacing macron long accent
    0xB0 => '°', # degree sign
    0xB1 => '±', # plus-or-minus sign
    0xB2 => '²', # superscript 2
    0xB3 => '³', # superscript 3
    0xB4 => '´', # spacing acute accent
    0xB5 => 'µ', # micro sign, mu
    0xB6 => '\u00b6', # paragraph sign, pilcrow sign
    0xB7 => '·', # middle dot, centered dot
    0xB8 => '¸', # spacing cedilla
    0xB9 => '¹', # superscript 1
    0xBA => 'º', # masculine ordinal indicator
    0xBB => '»', # right double angle quote or guillemet
    0xBC => '¼', # fraction 1/4
    0xBD => '½', # fraction 1/2
    0xBE => '¾', # fraction 3/4
    0xBF => '¿', # inverted question mark
    # Upper Case Latin-1 Letters
    0xC0 => 'À', # capital A grave
    0xC1 => 'Á', # capital A acute
    0xC2 => 'Â', # capital A circumflex
    0xC3 => 'Ã', # capital A tilde
    0xC4 => 'Ä', # capital A dieresis or umlaut
    0xC5 => 'Å', # capital A ring
    0xC6 => 'Æ', # capital AE ligature
    0xC7 => 'Ç', # capital C cedilla
    0xC8 => 'È', # capital E grave
    0xC9 => 'É', # capital E acute
    0xCA => 'Ê', # capital E circumflex
    0xCB => 'Ë', # capital E dieresis or umlaut
    0xCC => 'Ì', # capital I grave
    0xCD => 'Í', # capital I acute
    0xCE => 'Î', # capital I circumflex
    0xCF => 'Ï', # capital I dieresis or umlaut
    0xD0 => 'Ð', # capital ETH
    0xD1 => 'Ñ', # capital N tilde
    0xD2 => 'Ò', # capital O grave
    0xD3 => 'Ó', # capital O acute
    0xD4 => 'Ô', # capital O circumflex
    0xD5 => 'Õ', # capital O tilde
    0xD6 => 'Ö', # capital O dieresis or umlaut
    0xD7 => '×', # multiplication sign
    0xD8 => 'Ø', # capital O slash
    0xD9 => 'Ù', # capital U grave
    0xDA => 'Ú', # capital U acute
    0xDB => 'Û', # capital U circumflex
    0xDC => 'Ü', # capital U dieresis or umlaut
    0xDD => 'Ý', # capital Y acute
    0xDE => 'Þ', # capital THORN
    0xDF => 'ß', # small sharp s, sz ligature
    # Lower Case Latin-1 Letters
    0xE0 => 'à', # small a grave
    0xE1 => 'á', # small a acute
    0xE2 => 'â', # small a circumflex
    0xE3 => 'ã', # small a tilde
    0xE4 => 'ä', # small a dieresis or umlaut
    0xE5 => 'å', # small a ring
    0xE6 => 'æ', # small ae ligature
    0xE7 => 'ç', # small c cedilla
    0xE8 => 'è', # small e grave
    0xE9 => 'é', # small e acute
    0xEA => 'ê', # small e circumflex
    0xEB => 'ë', # small e dieresis or umlaut
    0xEC => 'ì', # small i grave
    0xED => 'í', # small i acute
    0xEE => 'î', # small i circumflex
    0xEF => 'ï', # small i dieresis or umlaut
    0xF0 => 'ð', # small eth
    0xF1 => 'ñ', # small n tilde
    0xF2 => 'ò', # small o grave
    0xF3 => 'ó', # small o acute
    0xF4 => 'ô', # small o circumflex
    0xF5 => 'õ', # small o tilde
    0xF6 => 'ö', # small o dieresis or umlaut
    0xF7 => '÷', # division sign
    0xF8 => 'ø', # small o slash
    0xF9 => 'ù', # small u grave
    0xFA => 'ú', # small u acute
    0xFB => 'û', # small u circumflex
    0xFC => 'ü', # small u dieresis or umlaut
    0xFD => 'ý', # small y acute
    0xFE => 'þ', # small thorn
    0xFF => 'ÿ', # small y dieresis or umlaut
)

# Encoding table, built as the inverse of DECODE_ISO_LATIN_1: every
# (byte => character) entry is flipped into (character => byte).
const ENCODE_ISO_LATIN_1 = Dict{Char, UInt8}(
    entry.second => entry.first for entry in DECODE_ISO_LATIN_1
)
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ using Test
using Encodings

text = "Café com pão"
data = UInt8[0x43, 0x61, 0x66, 0xe0, 0x20, 0x63, 0x6f, 0x6d, 0x20, 0x70, 0xe3, 0x6f]
data = UInt8[0x43, 0x61, 0x66, 0xe9, 0x20, 0x63, 0x6f, 0x6d, 0x20, 0x70, 0xe3, 0x6f]

@testset "Encoding :: ISO-LATIN-1" begin
@test encode(text, Encodings.ISO_LATIN_1()) == data
Expand Down

2 comments on commit 8df53d8

@pedromxavier
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/80988

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.1 -m "<description of version>" 8df53d85e472a1dad75f20928399b0f67eb47df4
git push origin v0.1.1

Please sign in to comment.