diff --git a/Cargo.toml b/Cargo.toml index 7aeb582..f153d37 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "chardetng_py" -version = "0.3.0" +version = "0.3.1" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/pyproject.toml b/pyproject.toml index dd6ce48..72bc146 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" [tool.poetry] name = "chardetng-py" -version = "0.3.0" +version = "0.3.1" description = "chardetng Python Module" authors = ["John Parton "] license = "MIT" diff --git a/python/chardetng_py/shortcuts.py b/python/chardetng_py/shortcuts.py index 03bd0c4..dfcb4a1 100644 --- a/python/chardetng_py/shortcuts.py +++ b/python/chardetng_py/shortcuts.py @@ -17,7 +17,12 @@ """ -def detect(byte_str: Union[bytes, bytearray], *, allow_utf8: bool = False) -> str: +def detect( + byte_str: Union[bytes, bytearray], + *, + allow_utf8: bool = False, + tld: Union[bytes, bytearray, None] = None, +) -> str: """Detect the encoding of :code:`byte_str`. Returned encoding is suitable for use with :code:`str.decode`. @@ -34,11 +39,17 @@ def detect(byte_str: Union[bytes, bytearray], *, allow_utf8: bool = False) -> st unless the user has taken a specific contextual action to request an override. This way, Web developers cannot start depending on UTF-8 detection. Such reliance would make the Web Platform more brittle. + tld : :code:`bytes` or :code:`bytearray` or :code:`None` + An exception is raised if :code:`tld` contains non-ASCII, period, or upper-case letters. The exception + condition is intentionally limited to signs of failing to extract the + label correctly, failing to provide it in its Punycode form, and failure + to lower-case it. Full DNS label validation is intentionally not performed + to avoid panics when the reality doesn't match the specs. 
""" encoding_detector = EncodingDetector() encoding_detector.feed(byte_str, last=True) - encoding: str = encoding_detector.guess(tld=None, allow_utf8=allow_utf8) + encoding: str = encoding_detector.guess(tld=tld, allow_utf8=allow_utf8) # chardetng uses 'windows-874' as an encoding, which Python does not understand # I believe that windows-874 and cp874 are basically the same encoding