Skip to content

Commit

Permalink
Expose tld argument in shortcut function.
Browse files Browse the repository at this point in the history
  • Loading branch information
john-parton committed Aug 27, 2023
1 parent e7be130 commit d995ee0
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "chardetng_py"
version = "0.3.0"
version = "0.3.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ readme = "README.md"

[tool.poetry]
name = "chardetng-py"
version = "0.3.0"
version = "0.3.1"
description = "chardetng Python Module"
authors = ["John Parton <[email protected]>"]
license = "MIT"
Expand Down
15 changes: 13 additions & 2 deletions python/chardetng_py/shortcuts.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
"""


def detect(byte_str: Union[bytes, bytearray], *, allow_utf8: bool = False) -> str:
def detect(
byte_str: Union[bytes, bytearray],
*,
allow_utf8: bool = False,
tld: Union[bytes, bytearray, None] = None,
) -> str:
"""Detect the encoding of :code:`byte_str`.
Returned encoding is suitable for use with :code:`str.decode`.
Expand All @@ -34,11 +39,17 @@ def detect(byte_str: Union[bytes, bytearray], *, allow_utf8: bool = False) -> st
unless the user has taken a specific contextual action to request an
override. This way, Web developers cannot start depending on UTF-8
detection. Such reliance would make the Web Platform more brittle.
tld : :code:`bytes` or :code:`bytearray` or :code:`None`
The rightmost DNS label (top-level domain) of the host the data was loaded
from, in lower-case ASCII (Punycode) form, or :code:`None` if unavailable.
An exception is raised if :code:`tld` contains non-ASCII, period, or
upper-case letters. The exception condition is intentionally limited to
signs of failing to extract the label correctly, failing to provide it in
its Punycode form, and failure to lower-case it. Full DNS label validation
is intentionally not performed to avoid panics when the reality doesn't
match the specs.
"""
encoding_detector = EncodingDetector()
encoding_detector.feed(byte_str, last=True)

encoding: str = encoding_detector.guess(tld=None, allow_utf8=allow_utf8)
encoding: str = encoding_detector.guess(tld=tld, allow_utf8=allow_utf8)

# chardetng uses 'windows-874' as an encoding, which Python does not understand
# I believe that windows-874 and cp874 are basically the same encoding
Expand Down

0 comments on commit d995ee0

Please sign in to comment.