diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md deleted file mode 100644 index 6c64b2c..0000000 --- a/docs/advanced_usage.md +++ /dev/null @@ -1,18 +0,0 @@ -# Advanced Usage - -The easiest way to get started is to use the :meth:`decode` method. - -```python ->>> import chardetng_py ->>> chardetng_py.decode(b'Jakby r\xeaka Boga') -'Jakby rêka Boga' -``` - -There is also a `detect` method available for compatability with `chardet`, -but it will always report `None` for the language and a confidence value of `0.99`. - -```python ->>> from chardetng_py.compat import detect ->>> detect(b'Jakby r\xeaka Boga') -{'encoding': 'cp1254', 'confidence': 0.99, 'language': None} -``` diff --git a/docs/class_reference.md b/docs/class_reference.md deleted file mode 100644 index 213a13c..0000000 --- a/docs/class_reference.md +++ /dev/null @@ -1,18 +0,0 @@ -# Class Reference - -This is the main python binding for chardetng, a rust character encoding detector for -legacy Web content. - -The class here is a wrapper around the rust struct `EncodingDetector`. The documentation -for the rust structure is available on [docs.rs](https://docs.rs/chardetng/). - -For more information about the overall function of the library, read Henri Sivonen's -[excellent write-up](https://hsivonen.fi/chardetng/). - -## chardetng_py.detector - -```{eval-rst} -.. automodule:: chardetng_py.detector - :members: - :undoc-members: -``` diff --git a/docs/class_reference.rst b/docs/class_reference.rst new file mode 100644 index 0000000..afb2a40 --- /dev/null +++ b/docs/class_reference.rst @@ -0,0 +1,19 @@ +Class Reference +=============== + +This is the main python binding for chardetng, a rust character encoding +detector for legacy Web content. + +The class here is a wrapper around the rust struct ``EncodingDetector``. +The documentation for the rust structure is available on +`docs.rs <https://docs.rs/chardetng/>`__. 
+ +For more information about the overall function of the library, read +Henri Sivonen’s `excellent write-up <https://hsivonen.fi/chardetng/>`__. + +chardetng_py.detector +--------------------- + +.. automodule:: chardetng_py.detector + :members: + :undoc-members: diff --git a/docs/codeofconduct.md b/docs/codeofconduct.md deleted file mode 100644 index 58fd373..0000000 --- a/docs/codeofconduct.md +++ /dev/null @@ -1,3 +0,0 @@ -```{include} ../CODE_OF_CONDUCT.md - -``` diff --git a/docs/codeofconduct.rst b/docs/codeofconduct.rst new file mode 100644 index 0000000..6af71d5 --- /dev/null +++ b/docs/codeofconduct.rst @@ -0,0 +1,3 @@ +\```{include} ../CODE_OF_CONDUCT.md + +\``\` diff --git a/docs/conf.py b/docs/conf.py index 4bf87af..0987583 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,7 +6,6 @@ "sphinx_toolbox.more_autodoc", "sphinx.ext.autodoc", "sphinx.ext.napoleon", - "myst_parser", "numpydoc", ] autodoc_typehints = "description" diff --git a/docs/contributing.md b/docs/contributing.md deleted file mode 100644 index b941964..0000000 --- a/docs/contributing.md +++ /dev/null @@ -1,7 +0,0 @@ -```{include} ../CONTRIBUTING.md ---- -end-before: ---- -``` - -[code of conduct]: codeofconduct diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..046cffe --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,7 @@ ++----------------------------------+ +| \```{include} ../CONTRIBUTING.md | ++==================================+ +| end-before: | ++----------------------------------+ + +\``\` diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 912bf88..0000000 --- a/docs/index.md +++ /dev/null @@ -1,26 +0,0 @@ -```{include} ../README.md ---- -end-before: ---- -``` - -[license]: license -[contributor guide]: contributing -[command-line reference]: usage - -```{toctree} ---- -hidden: -maxdepth: 1 ---- - -usage -shortcuts -reference -class_reference -recipes -contributing -Code of Conduct -License -Changelog -``` diff --git a/docs/index.rst 
b/docs/index.rst new file mode 100644 index 0000000..2427d7b --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,105 @@ +chardetng_py +============ + +|PyPI| |Status| |Python Version| |License| + +|Read the documentation at https://chardetng-py.readthedocs.io/| |Tests| + +|pre-commit| |Black| + +Features +-------- + +Python binding for the +`chardetng `__ character encoding +detector. + +Installation +------------ + +You can install ``chardetng_py`` via `pip `__ from +`PyPI `__: + +.. code:: console + + $ pip install chardetng-py + +Or via poetry: + +.. code:: console + + $ poetry add chardetng-py + +Quick Start +----------- + +The easiest way to get started is to use the ``detect`` method. + +.. code:: python + + >>> from chardetng_py import detect + >>> detect(b'Jakby r\xeaka Boga') + 'windows-1254' + +There is also a ``detect`` method available for compatibility with +``chardet``, but it will always report ``None`` for the language and a +confidence value of ``0.99``. + +.. code:: python + + >>> from chardetng_py.compat import detect + >>> detect(b'Jakby r\xeaka Boga') + {'encoding': 'windows-1254', 'confidence': 0.99, 'language': None} + +Contributing +------------ + +Contributions are very welcome. To learn more, see the `Contributor +Guide `__. + +License +------- + +Distributed under the terms of the `MIT +license `__, +``chardetng_py`` is free and open source software. + +Issues +------ + +If you encounter any problems, please `file an +issue `__ along with +a detailed description. + +Credits +------- + +.. raw:: html + + + +.. |PyPI| image:: https://img.shields.io/pypi/v/chardetng-py.svg + :target: https://pypi.org/project/chardetng-py/ +.. |Status| image:: https://img.shields.io/pypi/status/chardetng-py.svg + :target: https://pypi.org/project/chardetng-py/ +.. |Python Version| image:: https://img.shields.io/pypi/pyversions/chardetng-py + :target: https://pypi.org/project/chardetng-py +.. 
|License| image:: https://img.shields.io/pypi/l/chardetng-py + :target: https://github.com/john-parton/chardetng-py/blob/main/LICENSE +.. |Read the documentation at https://chardetng-py.readthedocs.io/| image:: https://img.shields.io/readthedocs/chardetng-py/latest.svg?label=Read%20the%20Docs + :target: https://chardetng-py.readthedocs.io/ +.. |Tests| image:: https://github.com/john-parton/chardetng-py/workflows/Tests/badge.svg + :target: https://github.com/john-parton/chardetng-py/actions?workflow=Tests +.. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white + :target: https://github.com/pre-commit/pre-commit +.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/psf/black + +.. toctree:: + :maxdepth: 0 + :hidden: + + usage + shortcuts + class_reference + recipes diff --git a/docs/license.md b/docs/license.md deleted file mode 100644 index 218790f..0000000 --- a/docs/license.md +++ /dev/null @@ -1,7 +0,0 @@ -# License - -```{literalinclude} ../LICENSE ---- -language: none ---- -``` diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 0000000..0daf08e --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,10 @@ +License +======= + ++---------------------------------+ +| \```{literalinclude} ../LICENSE | ++=================================+ +| language: none | ++---------------------------------+ + +\``\` diff --git a/docs/recipes.md b/docs/recipes.md deleted file mode 100644 index 1ef4462..0000000 --- a/docs/recipes.md +++ /dev/null @@ -1,111 +0,0 @@ -# Recipes - -These are some additional possible uses for chardetng_py. - -If there's sufficient interest, we can stabilise these and include them in the -main package. 
- -## Detect the encoding of a bytestring and return a CodecInfo object - -```python -def detect_codec( - byte_str: Union[bytes, bytearray], *, allow_utf8: bool = True -) -> codecs.CodecInfo: - r"""Detect the encoding of byte_str and return a CodecInfo object. - - Parameters - ---------- - byte_str : bytes or bytearray - Input buffer to detect the encoding of. - - Examples - -------- - >>> codec = detect_codec(b"Jakby r\xeaka Boga") - >>> codec.name - 'cp1254' - - """ - - return codecs.lookup(detect(byte_str, allow_utf8=allow_utf8)) -``` - -## Detect the encoding of a bytestring and return the decoded string - -```python -def decode( - byte_str: Union[bytes, bytearray], - errors: Literal[ - "strict", "ignore", "replace", "backslashreplace", "surrogateescape" - ] = "strict", - *, - allow_utf8: bool = True, -) -> str: - r"""Detect the encoding of byte_str and return the decoded string. - - Parameters - ---------- - byte_str : bytes or bytearray - Input buffer to decode. - errors: "strict" or "ignore" or "replace" or "backslashreplace" or "surrogateescape" - Error handler to use. See [Python documentation](https://docs.python.org/3/library/codecs.html#error-handlers) - - Examples - -------- - >>> decode(b"Jakby r\xeaka Boga") - 'Jakby rêka Boga' - - """ - return byte_str.decode(detect(byte_str, allow_utf8=allow_utf8), errors=errors) -``` - -## Open a file, incrementally determine its encoding and return a TextIOWrapper - -This is a neat trick that allows you to open a file and detect its encoding with -a fixed amount of memory. The other bindings I've found don't support this use-case -and you end up having to read the entire file into memory, which is a problem for -huge files. - -This also lets you directly pass a text file of unknown encoding to csv.writer of -csv.DictWriter, for example. 
- -```python -# Reads entire file -# We could add support for reading to some fixed position -def _detect_buffer(buffer: IO[bytes], *, allow_utf8: bool = True, **kwargs): - cursor_initial_position = buffer.tell() - - encoding_detector = EncodingDetector() - - # Not sure this is the best chunk size? - while chunk := buffer.read(io.DEFAULT_BUFFER_SIZE): - encoding_detector.feed(chunk, last=False) - - encoding_detector.feed(b"", last=True) - - buffer.seek(cursor_initial_position) - - return io.TextIOWrapper( - buffer, - encoding=encoding_detector.guess(tld=None, allow_utf8=allow_utf8), - **kwargs, - ) - - -# Could be nice to have an async one as well -# unfortunately async fs tools aren't in std lib -@contextmanager -def detect_open( - file: Union[bytes, str, PathLike], mode: Literal["r", "rt"] = "r", **kwargs -): - """Open a file and detect its encoding.""" - if mode not in {"r", "rt"}: - raise NotImplemented("Only reading supported at the moment") - # TODO Could support r+ and w+ modes of operation? - - # The whole point is that we're going to detect in - if "encoding" in kwargs: - raise ValueError - - with open(file, mode="rb", **kwargs) as f: - yield _detect_buffer(f) -``` diff --git a/docs/recipes.rst b/docs/recipes.rst new file mode 100644 index 0000000..dcc8d53 --- /dev/null +++ b/docs/recipes.rst @@ -0,0 +1,115 @@ +Recipes +======= + +These are some additional possible uses for chardetng_py. + +If there’s sufficient interest, we can stabilise these and include them +in the main package. + +Detect the encoding of a bytestring and return a CodecInfo object +----------------------------------------------------------------- + +.. code:: python + + def detect_codec( + byte_str: Union[bytes, bytearray], *, allow_utf8: bool = True + ) -> codecs.CodecInfo: + r"""Detect the encoding of byte_str and return a CodecInfo object. + + Parameters + ---------- + byte_str : bytes or bytearray + Input buffer to detect the encoding of. 
+ + Examples + -------- + >>> codec = detect_codec(b"Jakby r\xeaka Boga") + >>> codec.name + 'cp1254' + + """ + + return codecs.lookup(detect(byte_str, allow_utf8=allow_utf8)) + +Detect the encoding of a bytestring and return the decoded string +----------------------------------------------------------------- + +.. code:: python + + def decode( + byte_str: Union[bytes, bytearray], + errors: Literal[ + "strict", "ignore", "replace", "backslashreplace", "surrogateescape" + ] = "strict", + *, + allow_utf8: bool = True, + ) -> str: + r"""Detect the encoding of byte_str and return the decoded string. + + Parameters + ---------- + byte_str : bytes or bytearray + Input buffer to decode. + errors: "strict" or "ignore" or "replace" or "backslashreplace" or "surrogateescape" + Error handler to use. See [Python documentation](https://docs.python.org/3/library/codecs.html#error-handlers) + + Examples + -------- + >>> decode(b"Jakby r\xeaka Boga") + 'Jakby rêka Boga' + + """ + return byte_str.decode(detect(byte_str, allow_utf8=allow_utf8), errors=errors) + +Open a file, incrementally determine its encoding and return a TextIOWrapper +---------------------------------------------------------------------------- + +This is a neat trick that allows you to open a file and detect its +encoding with a fixed amount of memory. The other bindings I’ve found +don’t support this use-case and you end up having to read the entire +file into memory, which is a problem for huge files. + +This also lets you directly pass a text file of unknown encoding to +csv.reader or csv.DictReader, for example. + +.. code:: python + + # Reads entire file + # We could add support for reading to some fixed position + def _detect_buffer(buffer: IO[bytes], *, allow_utf8: bool = True, **kwargs): + cursor_initial_position = buffer.tell() + + encoding_detector = EncodingDetector() + + # Not sure this is the best chunk size? 
+ while chunk := buffer.read(io.DEFAULT_BUFFER_SIZE): + encoding_detector.feed(chunk, last=False) + + encoding_detector.feed(b"", last=True) + + buffer.seek(cursor_initial_position) + + return io.TextIOWrapper( + buffer, + encoding=encoding_detector.guess(tld=None, allow_utf8=allow_utf8), + **kwargs, + ) + + + # Could be nice to have an async one as well + # unfortunately async fs tools aren't in std lib + @contextmanager + def detect_open( + file: Union[bytes, str, PathLike], mode: Literal["r", "rt"] = "r", **kwargs + ): + """Open a file and detect its encoding.""" + if mode not in {"r", "rt"}: + raise NotImplementedError("Only reading supported at the moment") + # TODO Could support r+ and w+ modes of operation? + + # The whole point is that we're going to detect in + if "encoding" in kwargs: + raise ValueError + + with open(file, mode="rb", **kwargs) as f: + yield _detect_buffer(f) diff --git a/docs/requirements.txt b/docs/requirements.txt index 8778166..819b0e1 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ furo==2023.8.19 sphinx==7.2.3 -myst_parser==2.0.0 numpydoc==1.5.0 sphinx_toolbox diff --git a/docs/shortcuts.md b/docs/shortcuts.rst similarity index 63% rename from docs/shortcuts.md rename to docs/shortcuts.rst index 88bca8d..bc13bc8 100644 --- a/docs/shortcuts.md +++ b/docs/shortcuts.rst @@ -1,6 +1,5 @@ -# Shortcuts +Shortcuts +========= -```{eval-rst} .. automodule:: chardetng_py.shortcuts :members: -``` diff --git a/docs/usage.md b/docs/usage.md deleted file mode 100644 index ab5459c..0000000 --- a/docs/usage.md +++ /dev/null @@ -1,32 +0,0 @@ -# Usage - -## Basic Usage - -The easiest way to get started is to use the `detect` method. - -```python ->>> from chardetng_py import detect ->>> detect(b'Jakby r\xeaka Boga') -'windows-1254' -``` - -There is also a `detect` method available for compatability with `chardet`, -but it will always report `None` for the language and a confidence value of `0.99`. 
- -```python ->>> from chardetng_py.compat import detect ->>> detect(b'Jakby r\xeaka Boga') -{'encoding': 'windows-1254', 'confidence': 0.99, 'language': None} -``` - -## Advanced Usage - -It is also possible to use the `EncodingDetector` class directly. - -```python ->>> from chardetng_py import EncodingDetector ->>> detector = EncodingDetector() ->>> detector.feed(b'Jakby r\xeaka Boga', last=True) ->>> detector.guess(tld=None, allow_utf8=True) -'windows-1254' -``` diff --git a/docs/usage.rst b/docs/usage.rst new file mode 100644 index 0000000..9b91386 --- /dev/null +++ b/docs/usage.rst @@ -0,0 +1,36 @@ +Usage +===== + +Basic Usage +----------- + +The easiest way to get started is to use the ``detect`` method. + +.. code:: python + + >>> from chardetng_py import detect + >>> detect(b'Jakby r\xeaka Boga') + 'windows-1254' + +There is also a ``detect`` method available for compatibility with +``chardet``, but it will always report ``None`` for the language and a +confidence value of ``0.99``. + +.. code:: python + + >>> from chardetng_py.compat import detect + >>> detect(b'Jakby r\xeaka Boga') + {'encoding': 'windows-1254', 'confidence': 0.99, 'language': None} + +Advanced Usage +-------------- + +It is also possible to use the ``EncodingDetector`` class directly. + +.. code:: python + + >>> from chardetng_py import EncodingDetector + >>> detector = EncodingDetector() + >>> detector.feed(b'Jakby r\xeaka Boga', last=True) + >>> detector.guess(tld=None, allow_utf8=True) + 'windows-1254' diff --git a/poetry.lock b/poetry.lock index 77a5bc1..e5f2768 100644 --- a/poetry.lock +++ b/poetry.lock @@ -633,30 +633,6 @@ files = [ six = "*" tornado = {version = "*", markers = "python_version > \"2.7\""} -[[package]] -name = "markdown-it-py" -version = "3.0.0" -description = "Python port of markdown-it. Markdown parsing, done right!" 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, - {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, -] - -[package.dependencies] -mdurl = ">=0.1,<1.0" - -[package.extras] -benchmarking = ["psutil", "pytest", "pytest-benchmark"] -code-style = ["pre-commit (>=3.0,<4.0)"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] -linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins"] -profiling = ["gprof2dot"] -rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - [[package]] name = "markupsafe" version = "2.1.3" @@ -745,36 +721,6 @@ tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} patchelf = ["patchelf"] zig = ["ziglang (>=0.10.0,<0.11.0)"] -[[package]] -name = "mdit-py-plugins" -version = "0.4.0" -description = "Collection of plugins for markdown-it-py" -optional = false -python-versions = ">=3.8" -files = [ - {file = "mdit_py_plugins-0.4.0-py3-none-any.whl", hash = "sha256:b51b3bb70691f57f974e257e367107857a93b36f322a9e6d44ca5bf28ec2def9"}, - {file = "mdit_py_plugins-0.4.0.tar.gz", hash = "sha256:d8ab27e9aed6c38aa716819fedfde15ca275715955f8a185a8e1cf90fb1d2c1b"}, -] - -[package.dependencies] -markdown-it-py = ">=1.0.0,<4.0.0" - -[package.extras] -code-style = ["pre-commit"] -rtd = ["myst-parser", "sphinx-book-theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - -[[package]] -name = "mdurl" -version = "0.1.2" -description = "Markdown URL utilities" -optional = false -python-versions = ">=3.7" -files = [ - {file = "mdurl-0.1.2-py3-none-any.whl", hash = 
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, - {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, -] - [[package]] name = "msgpack" version = "1.0.5" @@ -904,32 +850,6 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] -[[package]] -name = "myst-parser" -version = "2.0.0" -description = "An extended [CommonMark](https://spec.commonmark.org/) compliant parser," -optional = false -python-versions = ">=3.8" -files = [ - {file = "myst_parser-2.0.0-py3-none-any.whl", hash = "sha256:7c36344ae39c8e740dad7fdabf5aa6fc4897a813083c6cc9990044eb93656b14"}, - {file = "myst_parser-2.0.0.tar.gz", hash = "sha256:ea929a67a6a0b1683cdbe19b8d2e724cd7643f8aa3e7bb18dd65beac3483bead"}, -] - -[package.dependencies] -docutils = ">=0.16,<0.21" -jinja2 = "*" -markdown-it-py = ">=3.0,<4.0" -mdit-py-plugins = ">=0.4,<1.0" -pyyaml = "*" -sphinx = ">=6,<8" - -[package.extras] -code-style = ["pre-commit (>=3.0,<4.0)"] -linkify = ["linkify-it-py (>=2.0,<3.0)"] -rtd = ["ipython", "pydata-sphinx-theme (==v0.13.0rc4)", "sphinx-autodoc2 (>=0.4.2,<0.5.0)", "sphinx-book-theme (==1.0.0rc2)", "sphinx-copybutton", "sphinx-design2", "sphinx-pyscript", "sphinx-tippy (>=0.3.1)", "sphinx-togglebutton", "sphinxext-opengraph (>=0.8.2,<0.9.0)", "sphinxext-rediraffe (>=0.2.7,<0.3.0)"] -testing = ["beautifulsoup4", "coverage[toml]", "pytest (>=7,<8)", "pytest-cov", "pytest-param-files (>=0.3.4,<0.4.0)", "pytest-regressions", "sphinx-pytest"] -testing-docutils = ["pygments", "pytest (>=7,<8)", "pytest-param-files (>=0.3.4,<0.4.0)"] - [[package]] name = "natsort" version = "8.4.0" @@ -1784,4 +1704,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "cb247b195fafefc4571d78e1c56c85d28c7e990d5f218e1fdc65ebd77d335d4a" +content-hash = 
"beef479df69e75eef4e07c30270ee1470035fc3f6d4554ce7724dff733735478" diff --git a/pyproject.toml b/pyproject.toml index 7e2549f..dd6ce48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ Changelog = "https://github.com/john-parton/chardetng-py/releases" [tool.poetry.dependencies] python = "^3.8" -myst-parser = "^2.0.0" [tool.poetry.dev-dependencies] numpydoc = "^1.5.0"