rudra0410hf committed
Commit ff4c600 · verified · 1 Parent(s): a7031fb

Delete env

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +0 -1
  2. env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE +0 -20
  3. env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA +0 -46
  4. env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD +0 -43
  5. env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL +0 -5
  6. env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +0 -2
  7. env/Lib/site-packages/_yaml/__init__.py +0 -33
  8. env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER +0 -1
  9. env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE +0 -20
  10. env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA +0 -77
  11. env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD +0 -14
  12. env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL +0 -5
  13. env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt +0 -1
  14. env/Lib/site-packages/certifi/__init__.py +0 -4
  15. env/Lib/site-packages/certifi/__main__.py +0 -12
  16. env/Lib/site-packages/certifi/cacert.pem +0 -0
  17. env/Lib/site-packages/certifi/core.py +0 -114
  18. env/Lib/site-packages/certifi/py.typed +0 -0
  19. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +0 -1
  20. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +0 -21
  21. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +0 -721
  22. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +0 -35
  23. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +0 -5
  24. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +0 -2
  25. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +0 -1
  26. env/Lib/site-packages/charset_normalizer/__init__.py +0 -48
  27. env/Lib/site-packages/charset_normalizer/__main__.py +0 -6
  28. env/Lib/site-packages/charset_normalizer/api.py +0 -668
  29. env/Lib/site-packages/charset_normalizer/cd.py +0 -395
  30. env/Lib/site-packages/charset_normalizer/cli/__init__.py +0 -8
  31. env/Lib/site-packages/charset_normalizer/cli/__main__.py +0 -321
  32. env/Lib/site-packages/charset_normalizer/constant.py +0 -1998
  33. env/Lib/site-packages/charset_normalizer/legacy.py +0 -66
  34. env/Lib/site-packages/charset_normalizer/md.py +0 -630
  35. env/Lib/site-packages/charset_normalizer/models.py +0 -360
  36. env/Lib/site-packages/charset_normalizer/py.typed +0 -0
  37. env/Lib/site-packages/charset_normalizer/utils.py +0 -408
  38. env/Lib/site-packages/charset_normalizer/version.py +0 -8
  39. env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER +0 -1
  40. env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA +0 -441
  41. env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD +0 -31
  42. env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL +0 -5
  43. env/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt +0 -27
  44. env/Lib/site-packages/colorama/__init__.py +0 -7
  45. env/Lib/site-packages/colorama/ansi.py +0 -102
  46. env/Lib/site-packages/colorama/ansitowin32.py +0 -277
  47. env/Lib/site-packages/colorama/initialise.py +0 -121
  48. env/Lib/site-packages/colorama/tests/__init__.py +0 -1
  49. env/Lib/site-packages/colorama/tests/ansi_test.py +0 -76
  50. env/Lib/site-packages/colorama/tests/ansitowin32_test.py +0 -294
env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER DELETED
@@ -1 +0,0 @@
- pip

env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE DELETED
@@ -1,20 +0,0 @@
- Copyright (c) 2017-2021 Ingy döt Net
- Copyright (c) 2006-2016 Kirill Simonov
-
- Permission is hereby granted, free of charge, to any person obtaining a copy of
- this software and associated documentation files (the "Software"), to deal in
- the Software without restriction, including without limitation the rights to
- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- of the Software, and to permit persons to whom the Software is furnished to do
- so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.

env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA DELETED
@@ -1,46 +0,0 @@
- Metadata-Version: 2.1
- Name: PyYAML
- Version: 6.0.2
- Summary: YAML parser and emitter for Python
- Home-page: https://pyyaml.org/
- Download-URL: https://pypi.org/project/PyYAML/
- Author: Kirill Simonov
- Author-email: [email protected]
- License: MIT
- Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
- Project-URL: CI, https://github.com/yaml/pyyaml/actions
- Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
- Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
- Project-URL: Source Code, https://github.com/yaml/pyyaml
- Platform: Any
- Classifier: Development Status :: 5 - Production/Stable
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Cython
- Classifier: Programming Language :: Python
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Classifier: Programming Language :: Python :: Implementation :: CPython
- Classifier: Programming Language :: Python :: Implementation :: PyPy
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Classifier: Topic :: Text Processing :: Markup
- Requires-Python: >=3.8
- License-File: LICENSE
-
- YAML is a data serialization format designed for human readability
- and interaction with scripting languages. PyYAML is a YAML parser
- and emitter for Python.
-
- PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
- support, capable extension API, and sensible error messages. PyYAML
- supports standard YAML tags and provides Python-specific tags that
- allow to represent an arbitrary Python object.
-
- PyYAML is applicable for a broad range of tasks from complex
- configuration files to object serialization and persistence.
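
A hedged aside from the editor, not part of the deleted METADATA: the parse/emit round trip that this description refers to, using PyYAML's public API.

```python
# Editor's sketch (not part of this commit): the basic PyYAML round trip
# the description above refers to.
import yaml

doc = yaml.safe_load("name: example\nversion: 6.0.2\n")  # YAML text -> Python dict
print(doc["version"])                                     # -> 6.0.2
print(yaml.safe_dump(doc), end="")                        # Python dict -> YAML text
```
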
env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD DELETED
@@ -1,43 +0,0 @@
- PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
- PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
- PyYAML-6.0.2.dist-info/METADATA,sha256=9lwXqTOrXPts-jI2Lo5UwuaAYo0hiRA0BZqjch0WjAk,2106
- PyYAML-6.0.2.dist-info/RECORD,,
- PyYAML-6.0.2.dist-info/WHEEL,sha256=c7SWG1_hRvc9HXHEkmWlTu1Jr4WpzRucfzqTP-_8q0s,102
- PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
- _yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
- _yaml/__pycache__/__init__.cpython-312.pyc,,
- yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
- yaml/__pycache__/__init__.cpython-312.pyc,,
- yaml/__pycache__/composer.cpython-312.pyc,,
- yaml/__pycache__/constructor.cpython-312.pyc,,
- yaml/__pycache__/cyaml.cpython-312.pyc,,
- yaml/__pycache__/dumper.cpython-312.pyc,,
- yaml/__pycache__/emitter.cpython-312.pyc,,
- yaml/__pycache__/error.cpython-312.pyc,,
- yaml/__pycache__/events.cpython-312.pyc,,
- yaml/__pycache__/loader.cpython-312.pyc,,
- yaml/__pycache__/nodes.cpython-312.pyc,,
- yaml/__pycache__/parser.cpython-312.pyc,,
- yaml/__pycache__/reader.cpython-312.pyc,,
- yaml/__pycache__/representer.cpython-312.pyc,,
- yaml/__pycache__/resolver.cpython-312.pyc,,
- yaml/__pycache__/scanner.cpython-312.pyc,,
- yaml/__pycache__/serializer.cpython-312.pyc,,
- yaml/__pycache__/tokens.cpython-312.pyc,,
- yaml/_yaml.cp312-win_amd64.pyd,sha256=Bx7e_LEQx7cnd1_A9_nClp3X77g-_Lw1aoAAtYZbwWk,263680
- yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
- yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
- yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
- yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
- yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
- yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
- yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
- yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
- yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
- yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
- yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
- yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
- yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
- yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
- yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
- yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573

env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL DELETED
@@ -1,5 +0,0 @@
- Wheel-Version: 1.0
- Generator: bdist_wheel (0.44.0)
- Root-Is-Purelib: false
- Tag: cp312-cp312-win_amd64
-

env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt DELETED
@@ -1,2 +0,0 @@
- _yaml
- yaml

env/Lib/site-packages/_yaml/__init__.py DELETED
@@ -1,33 +0,0 @@
- # This is a stub package designed to roughly emulate the _yaml
- # extension module, which previously existed as a standalone module
- # and has been moved into the `yaml` package namespace.
- # It does not perfectly mimic its old counterpart, but should get
- # close enough for anyone who's relying on it even when they shouldn't.
- import yaml
-
- # in some circumstances, the yaml module we imoprted may be from a different version, so we need
- # to tread carefully when poking at it here (it may not have the attributes we expect)
- if not getattr(yaml, '__with_libyaml__', False):
-     from sys import version_info
-
-     exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
-     raise exc("No module named '_yaml'")
- else:
-     from yaml._yaml import *
-     import warnings
-     warnings.warn(
-         'The _yaml extension module is now located at yaml._yaml'
-         ' and its location is subject to change. To use the'
-         ' LibYAML-based parser and emitter, import from `yaml`:'
-         ' `from yaml import CLoader as Loader, CDumper as Dumper`.',
-         DeprecationWarning
-     )
-     del warnings
-     # Don't `del yaml` here because yaml is actually an existing
-     # namespace member of _yaml.
-
- __name__ = '_yaml'
- # If the module is top-level (i.e. not a part of any specific package)
- # then the attribute should be set to ''.
- # https://docs.python.org/3.8/library/types.html
- __package__ = ''
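
Once env/ is deleted, anything that was importing this vendored stub should follow the advice in its DeprecationWarning. A minimal sketch from the editor (not part of the diff) of that import pattern, with the pure-Python fallback PyYAML always ships:

```python
# Editor's sketch, not part of this commit: the import style the stub's
# DeprecationWarning recommends instead of importing _yaml directly.
import yaml

try:
    # LibYAML-backed classes; available only when PyYAML was built with
    # the C extension (yaml.__with_libyaml__ is True).
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    # Pure-Python fallback shipped with every PyYAML build.
    from yaml import Loader, Dumper

data = yaml.load("a: 1", Loader=Loader)
print(yaml.dump(data, Dumper=Dumper), end="")
```
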
env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER DELETED
@@ -1 +0,0 @@
- pip

env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE DELETED
@@ -1,20 +0,0 @@
- This package contains a modified version of ca-bundle.crt:
-
- ca-bundle.crt -- Bundle of CA Root Certificates
-
- This is a bundle of X.509 certificates of public Certificate Authorities
- (CA). These were automatically extracted from Mozilla's root certificates
- file (certdata.txt). This file can be found in the mozilla source tree:
- https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
- It contains the certificates in PEM format and therefore
- can be directly used with curl / libcurl / php_curl, or with
- an Apache+mod_ssl webserver for SSL client authentication.
- Just configure this file as the SSLCACertificateFile.#
-
- ***** BEGIN LICENSE BLOCK *****
- This Source Code Form is subject to the terms of the Mozilla Public License,
- v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
- one at http://mozilla.org/MPL/2.0/.
-
- ***** END LICENSE BLOCK *****
- @(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $

env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA DELETED
@@ -1,77 +0,0 @@
- Metadata-Version: 2.2
- Name: certifi
- Version: 2025.1.31
- Summary: Python package for providing Mozilla's CA Bundle.
- Home-page: https://github.com/certifi/python-certifi
- Author: Kenneth Reitz
- Author-email: [email protected]
- License: MPL-2.0
- Project-URL: Source, https://github.com/certifi/python-certifi
- Classifier: Development Status :: 5 - Production/Stable
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
- Classifier: Natural Language :: English
- Classifier: Programming Language :: Python
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.6
- Classifier: Programming Language :: Python :: 3.7
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Python: >=3.6
- License-File: LICENSE
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: home-page
- Dynamic: license
- Dynamic: project-url
- Dynamic: requires-python
- Dynamic: summary
-
- Certifi: Python SSL Certificates
- ================================
-
- Certifi provides Mozilla's carefully curated collection of Root Certificates for
- validating the trustworthiness of SSL certificates while verifying the identity
- of TLS hosts. It has been extracted from the `Requests`_ project.
-
- Installation
- ------------
-
- ``certifi`` is available on PyPI. Simply install it with ``pip``::
-
-     $ pip install certifi
-
- Usage
- -----
-
- To reference the installed certificate authority (CA) bundle, you can use the
- built-in function::
-
-     >>> import certifi
-
-     >>> certifi.where()
-     '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
-
- Or from the command line::
-
-     $ python -m certifi
-     /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
-
- Enjoy!
-
- .. _`Requests`: https://requests.readthedocs.io/en/master/
-
- Addition/Removal of Certificates
- --------------------------------
-
- Certifi does not support any addition/removal or other modification of the
- CA trust store content. This project is intended to provide a reliable and
- highly portable root of trust to python deployments. Look to upstream projects
- for methods to use alternate trust.

env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD DELETED
@@ -1,14 +0,0 @@
- certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
- certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
- certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
- certifi-2025.1.31.dist-info/RECORD,,
- certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
- certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
- certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
- certifi/__pycache__/__init__.cpython-312.pyc,,
- certifi/__pycache__/__main__.cpython-312.pyc,,
- certifi/__pycache__/core.cpython-312.pyc,,
- certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
- certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
- certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL DELETED
@@ -1,5 +0,0 @@
- Wheel-Version: 1.0
- Generator: setuptools (75.8.0)
- Root-Is-Purelib: true
- Tag: py3-none-any
-

env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
- certifi

env/Lib/site-packages/certifi/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .core import contents, where
-
- __all__ = ["contents", "where"]
- __version__ = "2025.01.31"

env/Lib/site-packages/certifi/__main__.py DELETED
@@ -1,12 +0,0 @@
- import argparse
-
- from certifi import contents, where
-
- parser = argparse.ArgumentParser()
- parser.add_argument("-c", "--contents", action="store_true")
- args = parser.parse_args()
-
- if args.contents:
-     print(contents())
- else:
-     print(where())

env/Lib/site-packages/certifi/cacert.pem DELETED
The diff for this file is too large to render. See raw diff
 
env/Lib/site-packages/certifi/core.py DELETED
@@ -1,114 +0,0 @@
- """
- certifi.py
- ~~~~~~~~~~
-
- This module returns the installation location of cacert.pem or its contents.
- """
- import sys
- import atexit
-
- def exit_cacert_ctx() -> None:
-     _CACERT_CTX.__exit__(None, None, None)  # type: ignore[union-attr]
-
-
- if sys.version_info >= (3, 11):
-
-     from importlib.resources import as_file, files
-
-     _CACERT_CTX = None
-     _CACERT_PATH = None
-
-     def where() -> str:
-         # This is slightly terrible, but we want to delay extracting the file
-         # in cases where we're inside of a zipimport situation until someone
-         # actually calls where(), but we don't want to re-extract the file
-         # on every call of where(), so we'll do it once then store it in a
-         # global variable.
-         global _CACERT_CTX
-         global _CACERT_PATH
-         if _CACERT_PATH is None:
-             # This is slightly janky, the importlib.resources API wants you to
-             # manage the cleanup of this file, so it doesn't actually return a
-             # path, it returns a context manager that will give you the path
-             # when you enter it and will do any cleanup when you leave it. In
-             # the common case of not needing a temporary file, it will just
-             # return the file system location and the __exit__() is a no-op.
-             #
-             # We also have to hold onto the actual context manager, because
-             # it will do the cleanup whenever it gets garbage collected, so
-             # we will also store that at the global level as well.
-             _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
-             _CACERT_PATH = str(_CACERT_CTX.__enter__())
-             atexit.register(exit_cacert_ctx)
-
-         return _CACERT_PATH
-
-     def contents() -> str:
-         return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
-
- elif sys.version_info >= (3, 7):
-
-     from importlib.resources import path as get_path, read_text
-
-     _CACERT_CTX = None
-     _CACERT_PATH = None
-
-     def where() -> str:
-         # This is slightly terrible, but we want to delay extracting the
-         # file in cases where we're inside of a zipimport situation until
-         # someone actually calls where(), but we don't want to re-extract
-         # the file on every call of where(), so we'll do it once then store
-         # it in a global variable.
-         global _CACERT_CTX
-         global _CACERT_PATH
-         if _CACERT_PATH is None:
-             # This is slightly janky, the importlib.resources API wants you
-             # to manage the cleanup of this file, so it doesn't actually
-             # return a path, it returns a context manager that will give
-             # you the path when you enter it and will do any cleanup when
-             # you leave it. In the common case of not needing a temporary
-             # file, it will just return the file system location and the
-             # __exit__() is a no-op.
-             #
-             # We also have to hold onto the actual context manager, because
-             # it will do the cleanup whenever it gets garbage collected, so
-             # we will also store that at the global level as well.
-             _CACERT_CTX = get_path("certifi", "cacert.pem")
-             _CACERT_PATH = str(_CACERT_CTX.__enter__())
-             atexit.register(exit_cacert_ctx)
-
-         return _CACERT_PATH
-
-     def contents() -> str:
-         return read_text("certifi", "cacert.pem", encoding="ascii")
-
- else:
-     import os
-     import types
-     from typing import Union
-
-     Package = Union[types.ModuleType, str]
-     Resource = Union[str, "os.PathLike"]
-
-     # This fallback will work for Python versions prior to 3.7 that lack the
-     # importlib.resources module but relies on the existing `where` function
-     # so won't address issues with environments like PyOxidizer that don't set
-     # __file__ on modules.
-     def read_text(
-         package: Package,
-         resource: Resource,
-         encoding: str = 'utf-8',
-         errors: str = 'strict'
-     ) -> str:
-         with open(where(), encoding=encoding) as data:
-             return data.read()
-
-     # If we don't have importlib.resources, then we will just do the old logic
-     # of assuming we're on the filesystem and munge the path directly.
-     def where() -> str:
-         f = os.path.dirname(__file__)
-
-         return os.path.join(f, "cacert.pem")
-
-     def contents() -> str:
-         return read_text("certifi", "cacert.pem", encoding="ascii")
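
With the vendored copy gone, consumers should take `certifi` from PyPI instead; a short sketch from the editor (not part of the diff) of how `where()` is typically consumed, mirroring the usage section in the METADATA above:

```python
# Editor's sketch, not part of this commit: certifi.where() feeding an
# SSL context, the usual consumer of the bundled cacert.pem.
import ssl

import certifi

# ssl.create_default_context accepts a CA bundle path via cafile=.
ctx = ssl.create_default_context(cafile=certifi.where())
print(certifi.where())  # filesystem location of the extracted bundle
```
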
env/Lib/site-packages/certifi/py.typed DELETED
File without changes
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER DELETED
@@ -1 +0,0 @@
- pip

env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2025 TAHRI Ahmed R.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.

env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA DELETED
@@ -1,721 +0,0 @@
- Metadata-Version: 2.1
- Name: charset-normalizer
- Version: 3.4.1
- Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
- Author-email: "Ahmed R. TAHRI" <[email protected]>
- Maintainer-email: "Ahmed R. TAHRI" <[email protected]>
- License: MIT
- Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
- Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
- Project-URL: Code, https://github.com/jawah/charset_normalizer
- Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
- Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
- Classifier: Development Status :: 5 - Production/Stable
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.7
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: Implementation :: CPython
- Classifier: Programming Language :: Python :: Implementation :: PyPy
- Classifier: Topic :: Text Processing :: Linguistic
- Classifier: Topic :: Utilities
- Classifier: Typing :: Typed
- Requires-Python: >=3.7
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Provides-Extra: unicode-backport
-
- <h1 align="center">Charset Detection, for Everyone 👋</h1>
-
- <p align="center">
-   <sup>The Real First Universal Charset Detector</sup><br>
-   <a href="https://pypi.org/project/charset-normalizer">
-     <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
-   </a>
-   <a href="https://pepy.tech/project/charset-normalizer/">
-     <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
-   </a>
-   <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
-     <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
-   </a>
- </p>
- <p align="center">
-   <sup><i>Featured Packages</i></sup><br>
-   <a href="https://github.com/jawah/niquests">
-     <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
-   </a>
-   <a href="https://github.com/jawah/wassima">
-     <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
-   </a>
- </p>
- <p align="center">
-   <sup><i>In other language (unofficial port - by the community)</i></sup><br>
-   <a href="https://github.com/nickspring/charset-normalizer-rs">
-     <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
-   </a>
- </p>
-
- > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
- > I'm trying to resolve the issue by taking a new approach.
- > All IANA character set names for which the Python core library provides codecs are supported.
-
- <p align="center">
-   >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
- </p>
-
- This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
-
- | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
- |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
- | `Fast` | ❌ | ✅ | ✅ |
- | `Universal**` | ❌ | ✅ | ❌ |
- | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
- | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
- | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
- | `Native Python` | ✅ | ✅ | ❌ |
- | `Detect spoken language` | ❌ | ✅ | N/A |
- | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
- | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
- | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
-
- <p align="center">
- <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
- </p>
-
- *\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
-
- ## ⚡ Performance
-
- This package offer better performance than its counterpart Chardet. Here are some numbers.
-
- | Package | Accuracy | Mean per file (ms) | File per sec (est) |
- |-----------------------------------------------|:--------:|:------------------:|:------------------:|
- | [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
- | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
-
- | Package | 99th percentile | 95th percentile | 50th percentile |
- |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
- | [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
- | charset-normalizer | 100 ms | 50 ms | 5 ms |
-
- _updated as of december 2024 using CPython 3.12_
-
- Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
-
- > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
- > And yes, these results might change at any time. The dataset can be updated to include more files.
- > The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
- > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
- > (e.g. Supported Encoding) Challenge-them if you want.
-
- ## ✨ Installation
-
- Using pip:
-
- ```sh
- pip install charset-normalizer -U
- ```
-
- ## 🚀 Basic Usage
-
- ### CLI
- This package comes with a CLI.
-
- ```
- usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
-                   file [file ...]
-
- The Real First Universal Charset Detector. Discover originating encoding used
- on text file. Normalize text to unicode.
-
- positional arguments:
-   files                 File(s) to be analysed
-
- optional arguments:
-   -h, --help            show this help message and exit
-   -v, --verbose         Display complementary information about file if any.
-                         Stdout will contain logs about the detection process.
-   -a, --with-alternative
-                         Output complementary possibilities if any. Top-level
-                         JSON WILL be a list.
-   -n, --normalize       Permit to normalize input file. If not set, program
-                         does not write anything.
-   -m, --minimal         Only output the charset detected to STDOUT. Disabling
-                         JSON output.
-   -r, --replace         Replace file when trying to normalize it instead of
-                         creating a new one.
-   -f, --force           Replace file without asking if you are sure, use this
-                         flag with caution.
-   -t THRESHOLD, --threshold THRESHOLD
-                         Define a custom maximum amount of chaos allowed in
-                         decoded content. 0. <= chaos <= 1.
-   --version             Show version information and exit.
- ```
-
- ```bash
- normalizer ./data/sample.1.fr.srt
- ```
-
- or
-
- ```bash
- python -m charset_normalizer ./data/sample.1.fr.srt
- ```
-
- 🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
-
- ```json
- {
-     "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
-     "encoding": "cp1252",
-     "encoding_aliases": [
-         "1252",
-         "windows_1252"
-     ],
-     "alternative_encodings": [
-         "cp1254",
-         "cp1256",
-         "cp1258",
-         "iso8859_14",
-         "iso8859_15",
-         "iso8859_16",
-         "iso8859_3",
-         "iso8859_9",
-         "latin_1",
-         "mbcs"
-     ],
-     "language": "French",
-     "alphabets": [
-         "Basic Latin",
-         "Latin-1 Supplement"
-     ],
-     "has_sig_or_bom": false,
-     "chaos": 0.149,
-     "coherence": 97.152,
-     "unicode_path": null,
-     "is_preferred": true
- }
- ```
-
- ### Python
- *Just print out normalized text*
- ```python
- from charset_normalizer import from_path
-
- results = from_path('./my_subtitle.srt')
-
- print(str(results.best()))
- ```
-
- *Upgrade your code without effort*
- ```python
- from charset_normalizer import detect
- ```
-
- The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
-
- See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
-
- ## 😇 Why
-
- When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
- reliable alternative using a completely different method. Also! I never back down on a good challenge!
-
- I **don't care** about the **originating charset** encoding, because **two different tables** can
- produce **two identical rendered string.**
- What I want is to get readable text, the best I can.
-
- In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
-
- Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
-
- ## 🍰 How
-
- - Discard all charset encoding table that could not fit the binary content.
- - Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
- - Extract matches with the lowest mess detected.
- - Additionally, we measure coherence / probe for a language.
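
An aside from the editor, not part of the deleted METADATA: the first step of the "How" list above is easy to picture in isolation. The sketch below uses a hypothetical `candidate_decodings` helper; the real chunking, noise scoring, and coherence logic lives in the deleted `charset_normalizer/api.py`, `md.py`, and `cd.py` modules and is far more involved.

```python
# Editor's sketch only: step 1 of the README's "How" list, i.e. keep just
# the encodings whose table can fit the binary content at all.
def candidate_decodings(payload: bytes, encodings=("utf_8", "cp1252", "latin_1")):
    surviving = {}
    for enc in encodings:
        try:
            surviving[enc] = payload.decode(enc)  # this table fits the bytes
        except UnicodeDecodeError:
            continue  # discard: this table cannot represent the payload
    return surviving

# b'h\xe9llo' (cp1252 "héllo") is not valid UTF-8, so utf_8 is discarded.
print(sorted(candidate_decodings(b"h\xe9llo")))  # ['cp1252', 'latin_1']
```
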
-
- **Wait a minute**, what is noise/mess and coherence according to **YOU ?**
-
- *Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
- **I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
- I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
- improve or rewrite it.
-
- *Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
- that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
-
- ## ⚡ Known limitations
-
- - Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
- - Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
-
- ## ⚠️ About Python EOLs
-
- **If you are running:**
-
- - Python >=2.7,<3.5: Unsupported
- - Python 3.5: charset-normalizer < 2.1
- - Python 3.6: charset-normalizer < 3.1
- - Python 3.7: charset-normalizer < 4.0
-
- Upgrade your Python interpreter as soon as possible.
-
- ## 👤 Contributing
-
- Contributions, issues and feature requests are very much welcome.<br />
- Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
-
- ## 📝 License
-
- Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
- This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
-
- Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
-
- ## 💼 For Enterprise
-
- Professional support for charset-normalizer is available as part of the [Tidelift
- Subscription][1]. Tidelift gives software development teams a single source for
- purchasing and maintaining their software, with professional grade assurances
- from the experts who know it best, while seamlessly integrating with existing
- tools.
-
- [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
-
- [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
-
298
- # Changelog
299
- All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
300
- The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
301
-
302
- ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
303
-
304
- ### Changed
305
- - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
306
- - Enforce annotation delayed loading for a simpler and consistent types in the project.
307
- - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
308
-
309
- ### Added
310
- - pre-commit configuration.
311
- - noxfile.
312
-
313
- ### Removed
314
- - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
315
- - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
316
- - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
317
- - Unused `utils.range_scan` function.
318
-
319
- ### Fixed
320
- - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
321
- - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
322
-
323
- ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
324
-
325
- ### Added
326
- - Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
327
- - Support for Python 3.13 (#512)
328
-
329
- ### Fixed
330
- - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
331
- - Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
332
- - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
333
-
334
- ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
335
-
336
- ### Fixed
337
- - Unintentional memory usage regression when using large payload that match several encoding (#376)
338
- - Regression on some detection case showcased in the documentation (#371)
339
-
340
- ### Added
341
- - Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
342
-
343
- ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
344
-
345
- ### Changed
346
- - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
347
- - Improved the general detection reliability based on reports from the community
348
-
349
- ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
350
-
351
- ### Added
352
- - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
353
- - Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
354
-
355
- ### Removed
356
- - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
357
- - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
358
-
359
- ### Changed
360
- - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
361
- - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
362
-
363
- ### Fixed
364
- - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
365
-
366
- ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
367
-
368
- ### Changed
369
- - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
370
- - Minor improvement over the global detection reliability
371
-
372
- ### Added
373
- - Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
374
- - Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
375
- - Explicit support for Python 3.12
376
-
377
- ### Fixed
378
- - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
379
-
380
- ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
381
-
382
- ### Added
383
- - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
384
-
385
- ### Removed
386
- - Support for Python 3.6 (PR #260)
387
-
388
- ### Changed
389
- - Optional speedup provided by mypy/c 1.0.1
390
-
391
- ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
392
-
393
- ### Fixed
394
- - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
395
-
396
- ### Changed
397
- - Speedup provided by mypy/c 0.990 on Python >= 3.7
398
-
399
- ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
400
-
401
- ### Added
402
- - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
403
- - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
404
- - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
405
- - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
406
-
407
- ### Changed
408
- - Build with static metadata using 'build' frontend
409
- - Make the language detection stricter
410
- - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
411
-
412
- ### Fixed
413
- - CLI with opt --normalize fail when using full path for files
414
- - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
415
- - Sphinx warnings when generating the documentation
416
-
417
- ### Removed
418
- - Coherence detector no longer return 'Simple English' instead return 'English'
419
- - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
420
- - Breaking: Method `first()` and `best()` from CharsetMatch
421
- - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
422
- - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
423
- - Breaking: Top-level function `normalize`
424
- - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
425
- - Support for the backport `unicodedata2`
426
-
427
- ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
428
-
429
- ### Added
430
- - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
431
- - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
432
- - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
433
-
434
- ### Changed
435
- - Build with static metadata using 'build' frontend
436
- - Make the language detection stricter
437
-
438
- ### Fixed
439
- - CLI with opt --normalize fail when using full path for files
440
- - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
441
-
442
- ### Removed
443
- - Coherence detector no longer return 'Simple English' instead return 'English'
444
- - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
445
-
446
- ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
447
-
448
- ### Added
449
- - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
450
-
451
- ### Removed
452
- - Breaking: Method `first()` and `best()` from CharsetMatch
453
- - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
454
-
455
- ### Fixed
456
- - Sphinx warnings when generating the documentation
457
-
458
- ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
459
-
460
- ### Changed
461
- - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
462
-
463
- ### Removed
464
- - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
465
- - Breaking: Top-level function `normalize`
466
- - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
467
- - Support for the backport `unicodedata2`
468
-
469
- ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
470
-
471
- ### Deprecated
472
- - Function `normalize` scheduled for removal in 3.0
473
-
474
- ### Changed
475
- - Removed useless call to decode in fn is_unprintable (#206)
476
-
477
- ### Fixed
478
- - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
479
-
480
- ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
481
-
482
- ### Added
483
- - Output the Unicode table version when running the CLI with `--version` (PR #194)
484
-
485
- ### Changed
486
- - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
487
- - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
488
-
489
- ### Fixed
490
- - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
491
- - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
492
-
493
- ### Removed
494
- - Support for Python 3.5 (PR #192)
495
-
496
- ### Deprecated
497
- - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
498
-
499
- ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
500
-
501
- ### Fixed
502
- - ASCII miss-detection on rare cases (PR #170)
503
-
504
- ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
505
-
506
- ### Added
507
- - Explicit support for Python 3.11 (PR #164)
508
-
509
- ### Changed
510
- - The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
511
-
512
- ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
513
-
514
- ### Fixed
515
- - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
516
-
517
- ### Changed
518
- - Skipping the language-detection (CD) on ASCII (PR #155)
519
-
520
- ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
521
-
522
- ### Changed
523
- - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
524
-
525
- ### Fixed
526
- - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
527
-
528
- ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
529
- ### Changed
530
- - Improvement over Vietnamese detection (PR #126)
531
- - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
532
- - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
533
- - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
534
- - Code style as refactored by Sourcery-AI (PR #131)
535
- - Minor adjustment on the MD around european words (PR #133)
536
- - Remove and replace SRTs from assets / tests (PR #139)
537
- - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
538
- - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
539
-
540
- ### Fixed
541
- - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
542
- - Avoid using too insignificant chunk (PR #137)
543
-
544
- ### Added
545
- - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
546
- - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
547
-
548
- ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
549
- ### Added
550
- - Add support for Kazakh (Cyrillic) language detection (PR #109)
551
-
552
- ### Changed
553
- - Further, improve inferring the language from a given single-byte code page (PR #112)
554
- - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
555
- - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
556
- - Various detection improvement (MD+CD) (PR #117)
557
-
558
- ### Removed
559
- - Remove redundant logging entry about detected language(s) (PR #115)
560
-
561
- ### Fixed
562
- - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
563
-
564
- ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
565
- ### Fixed
566
- - Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
567
- - Fix CLI crash when using --minimal output in certain cases (PR #103)
568
-
569
- ### Changed
570
- - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
571
-
572
- ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
573
- ### Changed
574
- - The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
575
- - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
576
- - The Unicode detection is slightly improved (PR #93)
577
- - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
578
-
579
- ### Removed
580
- - The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
581
-
582
- ### Fixed
583
- - In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
584
- - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
585
- - The MANIFEST.in was not exhaustive (PR #78)
586
-
587
- ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
588
- ### Fixed
589
- - The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
590
- - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
591
- - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
592
- - Submatch factoring could be wrong in rare edge cases (PR #72)
593
- - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
594
- - Fix line endings from CRLF to LF for certain project files (PR #67)
595
-
596
- ### Changed
597
- - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
598
- - Allow fallback on specified encoding if any (PR #71)
599
-
600
- ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
601
- ### Changed
602
- - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
603
- - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
604
-
605
- ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
606
- ### Fixed
607
- - Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
608
-
609
- ### Changed
610
- - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
611
-
612
- ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
613
- ### Fixed
614
- - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
615
- - Using explain=False permanently disable the verbose output in the current runtime (PR #47)
616
- - One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
617
- - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
618
-
619
- ### Changed
620
- - Public function normalize default args values were not aligned with from_bytes (PR #53)
621
-
622
- ### Added
623
- - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
624
-
625
- ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
626
- ### Changed
627
- - 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
628
- - Accent has been made on UTF-8 detection, should perform rather instantaneous.
629
- - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
630
- - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
631
- - The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
632
- - utf_7 detection has been reinstated.
633
-
634
- ### Removed
635
- - This package no longer requires anything when used with Python 3.5 (dropped cached_property)
636
- - Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbo-Croatian.
637
- - The exception hook on UnicodeDecodeError has been removed.
638
-
639
- ### Deprecated
640
- - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
641
-
642
- ### Fixed
643
- - The CLI output used the relative path of the file(s); it should be absolute.
644
-
645
- ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
646
- ### Fixed
647
- - Logger configuration/usage no longer conflict with others (PR #44)
648
-
649
- ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
650
- ### Removed
651
- - Using standard logging instead of the package loguru.
652
- - Dropping the nose test framework in favor of the maintained pytest.
653
- - Choosing not to use the dragonmapper package to help with gibberish Chinese/CJK text.
654
- - Requiring cached_property only for Python 3.5 due to a constraint; dropping it for every other interpreter version.
655
- - Stopping support for UTF-7 payloads that do not contain a SIG.
656
- - Dropping PrettyTable, replaced with pure JSON output in CLI.
657
-
658
- ### Fixed
659
- - The BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present, due to the sub-match factoring process.
660
- - Not searching properly for the BOM when trying the utf32/16 parent codec.
661
-
662
- ### Changed
663
- - Improving the package final size by compressing frequencies.json.
664
- - Huge improvement on the largest payloads.
665
-
666
- ### Added
667
- - CLI now produces JSON-consumable output.
668
- - Return ASCII if the given sequences fit, with reasonable confidence.
669
-
670
- ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
671
-
672
- ### Fixed
673
- - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
674
-
675
- ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
676
-
677
- ### Fixed
678
- - An empty payload given for detection could cause an exception when trying to access the `alphabets` property. (PR #39)
679
-
680
- ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
681
-
682
- ### Fixed
683
- - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
684
-
685
- ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
686
-
687
- ### Changed
688
- - Amend the previous release to allow prettytable 2.0 (PR #35)
689
-
690
- ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
691
-
692
- ### Fixed
693
- - Fix error while using the package with a Python pre-release interpreter (PR #33)
694
-
695
- ### Changed
696
- - Dependencies refactoring, constraints revised.
697
-
698
- ### Added
699
- - Add Python 3.9 and 3.10 to the supported interpreters
700
-
701
- MIT License
702
-
703
- Copyright (c) 2025 TAHRI Ahmed R.
704
-
705
- Permission is hereby granted, free of charge, to any person obtaining a copy
706
- of this software and associated documentation files (the "Software"), to deal
707
- in the Software without restriction, including without limitation the rights
708
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
709
- copies of the Software, and to permit persons to whom the Software is
710
- furnished to do so, subject to the following conditions:
711
-
712
- The above copyright notice and this permission notice shall be included in all
713
- copies or substantial portions of the Software.
714
-
715
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
716
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
717
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
718
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
719
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
720
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
721
- SOFTWARE.
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD DELETED
@@ -1,35 +0,0 @@
1
- ../../Scripts/normalizer.exe,sha256=aGyf7WAVLi4gHrr8F-d9-4fQG9ifpfMEXEvLwyt8KjI,108411
2
- charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
- charset_normalizer-3.4.1.dist-info/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
4
- charset_normalizer-3.4.1.dist-info/METADATA,sha256=0_fAC3DknimRZusm6kkP4ylPD0JVzBq5mKHWLNBJM6w,36034
5
- charset_normalizer-3.4.1.dist-info/RECORD,,
6
- charset_normalizer-3.4.1.dist-info/WHEEL,sha256=pWXrJbnZSH-J-PhYmKs2XNn4DHCPNBYq965vsBJBFvA,101
7
- charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
8
- charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
- charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
10
- charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
11
- charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
12
- charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
13
- charset_normalizer/__pycache__/api.cpython-312.pyc,,
14
- charset_normalizer/__pycache__/cd.cpython-312.pyc,,
15
- charset_normalizer/__pycache__/constant.cpython-312.pyc,,
16
- charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
17
- charset_normalizer/__pycache__/md.cpython-312.pyc,,
18
- charset_normalizer/__pycache__/models.cpython-312.pyc,,
19
- charset_normalizer/__pycache__/utils.cpython-312.pyc,,
20
- charset_normalizer/__pycache__/version.cpython-312.pyc,,
21
- charset_normalizer/api.py,sha256=2a0p2Gnhbdo9O6C04CNxTSN23fIbgOF20nxb0pWPNFM,23285
22
- charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
23
- charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
24
- charset_normalizer/cli/__main__.py,sha256=lZ89qRWun7FRxX0qm1GhK-m0DH0i048yiMAX1mVIuRg,10731
25
- charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
26
- charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
27
- charset_normalizer/constant.py,sha256=7OKYi28cJjZxIcX3lQCwfK9ijoOgaVEbERww7SqqNSY,42475
28
- charset_normalizer/legacy.py,sha256=v8An1aAQHUu036UWOhyIaDGkirZ0t4hfNVlyje5KInU,2394
29
- charset_normalizer/md.cp312-win_amd64.pyd,sha256=XBGy--IKda7c3iBfvw_dovocqb2RSucmVtxvtlG_3tA,10752
30
- charset_normalizer/md.py,sha256=e452fhwIAguEUr3FJzG7QZvFgXI-dVLOh_M1ZUiFI6U,20666
31
- charset_normalizer/md__mypyc.cp312-win_amd64.pyd,sha256=_-jWSji0BgBVvrIHbmabYQNMBF4-xTusdO5mu6P8JsA,125440
32
- charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
33
- charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- charset_normalizer/utils.py,sha256=oH9Q3WcAMwmsSB7uM8uDozz9DXnkYecbkTNbdnMbgzI,12410
35
- charset_normalizer/version.py,sha256=7_thI7FzRQxEsbtUYwrJs3FCFWF666mw74H8mggPRR0,123
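Each RECORD row above follows the wheel manifest format from PEP 376/PEP 627: a relative path, a hash field of the form sha256=<urlsafe-base64 digest, unpadded>, and a file size in bytes (RECORD lists itself with both fields empty). Below is a minimal sketch of how such a row could be verified; the helper name and the default site-packages path are illustrative, not part of the deleted package.

    import base64
    import hashlib

    def verify_record_row(row: str, site_packages: str = ".") -> bool:
        # One row looks like: "charset_normalizer/version.py,sha256=7_thI...,123"
        path, hash_field, size = row.rsplit(",", 2)
        if not hash_field:  # RECORD references itself without hash/size
            return True
        algorithm, _, expected = hash_field.partition("=")
        with open(f"{site_packages}/{path}", "rb") as fp:
            payload = fp.read()
        digest = hashlib.new(algorithm, payload).digest()
        # Wheel RECORD encodes digests as urlsafe base64 without '=' padding.
        encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
        return encoded == expected and len(payload) == int(size)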
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL DELETED
@@ -1,5 +0,0 @@
1
- Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
3
- Root-Is-Purelib: false
4
- Tag: cp312-cp312-win_amd64
5
-
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt DELETED
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- normalizer = charset_normalizer:cli.cli_detect
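This [console_scripts] entry is what generated the normalizer.exe launcher listed in RECORD: the target "charset_normalizer:cli.cli_detect" names the charset_normalizer module and the cli.cli_detect attribute inside it. A hedged sketch of the equivalent plain-Python launcher:

    import sys

    from charset_normalizer.cli import cli_detect

    if __name__ == "__main__":
        # Mirrors what the generated console script does with the process args.
        sys.exit(cli_detect(sys.argv[1:]))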
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt DELETED
@@ -1 +0,0 @@
1
- charset_normalizer
env/Lib/site-packages/charset_normalizer/__init__.py DELETED
@@ -1,48 +0,0 @@
1
- """
2
- Charset-Normalizer
3
- ~~~~~~~~~~~~~~
4
- The Real First Universal Charset Detector.
5
- A library that helps you read text from an unknown charset encoding.
6
- Motivated by chardet, this package tries to resolve the issue by taking a new approach.
7
- All IANA character set names for which the Python core library provides codecs are supported.
8
-
9
- Basic usage:
10
- >>> from charset_normalizer import from_bytes
11
- >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
- >>> best_guess = results.best()
13
- >>> str(best_guess)
14
- 'Bсеки човек има право на образование. Oбразованието!'
15
-
16
- Other methods and usages are available - see the full documentation
17
- at <https://github.com/Ousret/charset_normalizer>.
18
- :copyright: (c) 2021 by Ahmed TAHRI
19
- :license: MIT, see LICENSE for more details.
20
- """
21
-
22
- from __future__ import annotations
23
-
24
- import logging
25
-
26
- from .api import from_bytes, from_fp, from_path, is_binary
27
- from .legacy import detect
28
- from .models import CharsetMatch, CharsetMatches
29
- from .utils import set_logging_handler
30
- from .version import VERSION, __version__
31
-
32
- __all__ = (
33
- "from_fp",
34
- "from_path",
35
- "from_bytes",
36
- "is_binary",
37
- "detect",
38
- "CharsetMatch",
39
- "CharsetMatches",
40
- "__version__",
41
- "VERSION",
42
- "set_logging_handler",
43
- )
44
-
45
- # Attach a NullHandler to the top level logger by default
46
- # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
-
48
- logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
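A short usage sketch built from the public names exported above; the sample payload is illustrative only:

    from charset_normalizer import detect, from_bytes

    payload = "Bсеки човек има право на образование.".encode("utf_8")

    results = from_bytes(payload)   # CharsetMatches container
    best_guess = results.best()     # CharsetMatch or None
    if best_guess is not None:
        print(best_guess.encoding)  # e.g. "utf_8"
        print(str(best_guess))      # the decoded text

    # The legacy, chardet-style entry point returns a dict instead.
    print(detect(payload))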
env/Lib/site-packages/charset_normalizer/__main__.py DELETED
@@ -1,6 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from .cli import cli_detect
4
-
5
- if __name__ == "__main__":
6
- cli_detect()
env/Lib/site-packages/charset_normalizer/api.py DELETED
@@ -1,668 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- from os import PathLike
5
- from typing import BinaryIO
6
-
7
- from .cd import (
8
- coherence_ratio,
9
- encoding_languages,
10
- mb_encoding_languages,
11
- merge_coherence_ratios,
12
- )
13
- from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
14
- from .md import mess_ratio
15
- from .models import CharsetMatch, CharsetMatches
16
- from .utils import (
17
- any_specified_encoding,
18
- cut_sequence_chunks,
19
- iana_name,
20
- identify_sig_or_bom,
21
- is_cp_similar,
22
- is_multi_byte_encoding,
23
- should_strip_sig_or_bom,
24
- )
25
-
26
- logger = logging.getLogger("charset_normalizer")
27
- explain_handler = logging.StreamHandler()
28
- explain_handler.setFormatter(
29
- logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
30
- )
31
-
32
-
33
- def from_bytes(
34
- sequences: bytes | bytearray,
35
- steps: int = 5,
36
- chunk_size: int = 512,
37
- threshold: float = 0.2,
38
- cp_isolation: list[str] | None = None,
39
- cp_exclusion: list[str] | None = None,
40
- preemptive_behaviour: bool = True,
41
- explain: bool = False,
42
- language_threshold: float = 0.1,
43
- enable_fallback: bool = True,
44
- ) -> CharsetMatches:
45
- """
46
- Given a raw bytes sequence, return the best possible charsets usable to render str objects.
47
- If there are no results, it is a strong indicator that the source is binary/not text.
48
- By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence.
49
- It will give up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
50
-
51
- The preemptive behavior DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
52
- but never takes it for granted. It can improve performance.
53
-
54
- You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
55
- purpose.
56
-
57
- This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
58
- By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
59
- toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
60
- Custom logging format and handler can be set manually.
61
- """
62
-
63
- if not isinstance(sequences, (bytearray, bytes)):
64
- raise TypeError(
65
- "Expected object of type bytes or bytearray, got: {}".format(
66
- type(sequences)
67
- )
68
- )
69
-
70
- if explain:
71
- previous_logger_level: int = logger.level
72
- logger.addHandler(explain_handler)
73
- logger.setLevel(TRACE)
74
-
75
- length: int = len(sequences)
76
-
77
- if length == 0:
78
- logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
79
- if explain: # Defensive: ensure exit path clean handler
80
- logger.removeHandler(explain_handler)
81
- logger.setLevel(previous_logger_level or logging.WARNING)
82
- return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
83
-
84
- if cp_isolation is not None:
85
- logger.log(
86
- TRACE,
87
- "cp_isolation is set. use this flag for debugging purpose. "
88
- "limited list of encoding allowed : %s.",
89
- ", ".join(cp_isolation),
90
- )
91
- cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
92
- else:
93
- cp_isolation = []
94
-
95
- if cp_exclusion is not None:
96
- logger.log(
97
- TRACE,
98
- "cp_exclusion is set. use this flag for debugging purpose. "
99
- "limited list of encoding excluded : %s.",
100
- ", ".join(cp_exclusion),
101
- )
102
- cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
103
- else:
104
- cp_exclusion = []
105
-
106
- if length <= (chunk_size * steps):
107
- logger.log(
108
- TRACE,
109
- "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
110
- steps,
111
- chunk_size,
112
- length,
113
- )
114
- steps = 1
115
- chunk_size = length
116
-
117
- if steps > 1 and length / steps < chunk_size:
118
- chunk_size = int(length / steps)
119
-
120
- is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
121
- is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
122
-
123
- if is_too_small_sequence:
124
- logger.log(
125
- TRACE,
126
- "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
127
- length
128
- ),
129
- )
130
- elif is_too_large_sequence:
131
- logger.log(
132
- TRACE,
133
- "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
134
- length
135
- ),
136
- )
137
-
138
- prioritized_encodings: list[str] = []
139
-
140
- specified_encoding: str | None = (
141
- any_specified_encoding(sequences) if preemptive_behaviour else None
142
- )
143
-
144
- if specified_encoding is not None:
145
- prioritized_encodings.append(specified_encoding)
146
- logger.log(
147
- TRACE,
148
- "Detected declarative mark in sequence. Priority +1 given for %s.",
149
- specified_encoding,
150
- )
151
-
152
- tested: set[str] = set()
153
- tested_but_hard_failure: list[str] = []
154
- tested_but_soft_failure: list[str] = []
155
-
156
- fallback_ascii: CharsetMatch | None = None
157
- fallback_u8: CharsetMatch | None = None
158
- fallback_specified: CharsetMatch | None = None
159
-
160
- results: CharsetMatches = CharsetMatches()
161
-
162
- early_stop_results: CharsetMatches = CharsetMatches()
163
-
164
- sig_encoding, sig_payload = identify_sig_or_bom(sequences)
165
-
166
- if sig_encoding is not None:
167
- prioritized_encodings.append(sig_encoding)
168
- logger.log(
169
- TRACE,
170
- "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
171
- len(sig_payload),
172
- sig_encoding,
173
- )
174
-
175
- prioritized_encodings.append("ascii")
176
-
177
- if "utf_8" not in prioritized_encodings:
178
- prioritized_encodings.append("utf_8")
179
-
180
- for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
181
- if cp_isolation and encoding_iana not in cp_isolation:
182
- continue
183
-
184
- if cp_exclusion and encoding_iana in cp_exclusion:
185
- continue
186
-
187
- if encoding_iana in tested:
188
- continue
189
-
190
- tested.add(encoding_iana)
191
-
192
- decoded_payload: str | None = None
193
- bom_or_sig_available: bool = sig_encoding == encoding_iana
194
- strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
195
- encoding_iana
196
- )
197
-
198
- if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
199
- logger.log(
200
- TRACE,
201
- "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
202
- encoding_iana,
203
- )
204
- continue
205
- if encoding_iana in {"utf_7"} and not bom_or_sig_available:
206
- logger.log(
207
- TRACE,
208
- "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
209
- encoding_iana,
210
- )
211
- continue
212
-
213
- try:
214
- is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
215
- except (ModuleNotFoundError, ImportError):
216
- logger.log(
217
- TRACE,
218
- "Encoding %s does not provide an IncrementalDecoder",
219
- encoding_iana,
220
- )
221
- continue
222
-
223
- try:
224
- if is_too_large_sequence and is_multi_byte_decoder is False:
225
- str(
226
- (
227
- sequences[: int(50e4)]
228
- if strip_sig_or_bom is False
229
- else sequences[len(sig_payload) : int(50e4)]
230
- ),
231
- encoding=encoding_iana,
232
- )
233
- else:
234
- decoded_payload = str(
235
- (
236
- sequences
237
- if strip_sig_or_bom is False
238
- else sequences[len(sig_payload) :]
239
- ),
240
- encoding=encoding_iana,
241
- )
242
- except (UnicodeDecodeError, LookupError) as e:
243
- if not isinstance(e, LookupError):
244
- logger.log(
245
- TRACE,
246
- "Code page %s does not fit given bytes sequence at ALL. %s",
247
- encoding_iana,
248
- str(e),
249
- )
250
- tested_but_hard_failure.append(encoding_iana)
251
- continue
252
-
253
- similar_soft_failure_test: bool = False
254
-
255
- for encoding_soft_failed in tested_but_soft_failure:
256
- if is_cp_similar(encoding_iana, encoding_soft_failed):
257
- similar_soft_failure_test = True
258
- break
259
-
260
- if similar_soft_failure_test:
261
- logger.log(
262
- TRACE,
263
- "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
264
- encoding_iana,
265
- encoding_soft_failed,
266
- )
267
- continue
268
-
269
- r_ = range(
270
- 0 if not bom_or_sig_available else len(sig_payload),
271
- length,
272
- int(length / steps),
273
- )
274
-
275
- multi_byte_bonus: bool = (
276
- is_multi_byte_decoder
277
- and decoded_payload is not None
278
- and len(decoded_payload) < length
279
- )
280
-
281
- if multi_byte_bonus:
282
- logger.log(
283
- TRACE,
284
- "Code page %s is a multi byte encoding table and it appear that at least one character "
285
- "was encoded using n-bytes.",
286
- encoding_iana,
287
- )
288
-
289
- max_chunk_gave_up: int = int(len(r_) / 4)
290
-
291
- max_chunk_gave_up = max(max_chunk_gave_up, 2)
292
- early_stop_count: int = 0
293
- lazy_str_hard_failure = False
294
-
295
- md_chunks: list[str] = []
296
- md_ratios = []
297
-
298
- try:
299
- for chunk in cut_sequence_chunks(
300
- sequences,
301
- encoding_iana,
302
- r_,
303
- chunk_size,
304
- bom_or_sig_available,
305
- strip_sig_or_bom,
306
- sig_payload,
307
- is_multi_byte_decoder,
308
- decoded_payload,
309
- ):
310
- md_chunks.append(chunk)
311
-
312
- md_ratios.append(
313
- mess_ratio(
314
- chunk,
315
- threshold,
316
- explain is True and 1 <= len(cp_isolation) <= 2,
317
- )
318
- )
319
-
320
- if md_ratios[-1] >= threshold:
321
- early_stop_count += 1
322
-
323
- if (early_stop_count >= max_chunk_gave_up) or (
324
- bom_or_sig_available and strip_sig_or_bom is False
325
- ):
326
- break
327
- except (
328
- UnicodeDecodeError
329
- ) as e: # Lazy str loading may have missed something there
330
- logger.log(
331
- TRACE,
332
- "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
333
- encoding_iana,
334
- str(e),
335
- )
336
- early_stop_count = max_chunk_gave_up
337
- lazy_str_hard_failure = True
338
-
339
- # We might want to check the sequence again with the whole content
340
- # Only if the initial MD tests pass
341
- if (
342
- not lazy_str_hard_failure
343
- and is_too_large_sequence
344
- and not is_multi_byte_decoder
345
- ):
346
- try:
347
- sequences[int(50e3) :].decode(encoding_iana, errors="strict")
348
- except UnicodeDecodeError as e:
349
- logger.log(
350
- TRACE,
351
- "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
352
- encoding_iana,
353
- str(e),
354
- )
355
- tested_but_hard_failure.append(encoding_iana)
356
- continue
357
-
358
- mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
359
- if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
360
- tested_but_soft_failure.append(encoding_iana)
361
- logger.log(
362
- TRACE,
363
- "%s was excluded because of initial chaos probing. Gave up %i time(s). "
364
- "Computed mean chaos is %f %%.",
365
- encoding_iana,
366
- early_stop_count,
367
- round(mean_mess_ratio * 100, ndigits=3),
368
- )
369
- # Preparing those fallbacks in case we got nothing.
370
- if (
371
- enable_fallback
372
- and encoding_iana in ["ascii", "utf_8", specified_encoding]
373
- and not lazy_str_hard_failure
374
- ):
375
- fallback_entry = CharsetMatch(
376
- sequences,
377
- encoding_iana,
378
- threshold,
379
- False,
380
- [],
381
- decoded_payload,
382
- preemptive_declaration=specified_encoding,
383
- )
384
- if encoding_iana == specified_encoding:
385
- fallback_specified = fallback_entry
386
- elif encoding_iana == "ascii":
387
- fallback_ascii = fallback_entry
388
- else:
389
- fallback_u8 = fallback_entry
390
- continue
391
-
392
- logger.log(
393
- TRACE,
394
- "%s passed initial chaos probing. Mean measured chaos is %f %%",
395
- encoding_iana,
396
- round(mean_mess_ratio * 100, ndigits=3),
397
- )
398
-
399
- if not is_multi_byte_decoder:
400
- target_languages: list[str] = encoding_languages(encoding_iana)
401
- else:
402
- target_languages = mb_encoding_languages(encoding_iana)
403
-
404
- if target_languages:
405
- logger.log(
406
- TRACE,
407
- "{} should target any language(s) of {}".format(
408
- encoding_iana, str(target_languages)
409
- ),
410
- )
411
-
412
- cd_ratios = []
413
-
414
- # We shall skip the CD when it's about ASCII
415
- # Most of the time it's not relevant to run "language-detection" on it.
416
- if encoding_iana != "ascii":
417
- for chunk in md_chunks:
418
- chunk_languages = coherence_ratio(
419
- chunk,
420
- language_threshold,
421
- ",".join(target_languages) if target_languages else None,
422
- )
423
-
424
- cd_ratios.append(chunk_languages)
425
-
426
- cd_ratios_merged = merge_coherence_ratios(cd_ratios)
427
-
428
- if cd_ratios_merged:
429
- logger.log(
430
- TRACE,
431
- "We detected language {} using {}".format(
432
- cd_ratios_merged, encoding_iana
433
- ),
434
- )
435
-
436
- current_match = CharsetMatch(
437
- sequences,
438
- encoding_iana,
439
- mean_mess_ratio,
440
- bom_or_sig_available,
441
- cd_ratios_merged,
442
- (
443
- decoded_payload
444
- if (
445
- is_too_large_sequence is False
446
- or encoding_iana in [specified_encoding, "ascii", "utf_8"]
447
- )
448
- else None
449
- ),
450
- preemptive_declaration=specified_encoding,
451
- )
452
-
453
- results.append(current_match)
454
-
455
- if (
456
- encoding_iana in [specified_encoding, "ascii", "utf_8"]
457
- and mean_mess_ratio < 0.1
458
- ):
459
- # If md says nothing to worry about, then... stop immediately!
460
- if mean_mess_ratio == 0.0:
461
- logger.debug(
462
- "Encoding detection: %s is most likely the one.",
463
- current_match.encoding,
464
- )
465
- if explain: # Defensive: ensure exit path clean handler
466
- logger.removeHandler(explain_handler)
467
- logger.setLevel(previous_logger_level)
468
- return CharsetMatches([current_match])
469
-
470
- early_stop_results.append(current_match)
471
-
472
- if (
473
- len(early_stop_results)
474
- and (specified_encoding is None or specified_encoding in tested)
475
- and "ascii" in tested
476
- and "utf_8" in tested
477
- ):
478
- probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
479
- logger.debug(
480
- "Encoding detection: %s is most likely the one.",
481
- probable_result.encoding,
482
- )
483
- if explain: # Defensive: ensure exit path clean handler
484
- logger.removeHandler(explain_handler)
485
- logger.setLevel(previous_logger_level)
486
-
487
- return CharsetMatches([probable_result])
488
-
489
- if encoding_iana == sig_encoding:
490
- logger.debug(
491
- "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
492
- "the beginning of the sequence.",
493
- encoding_iana,
494
- )
495
- if explain: # Defensive: ensure exit path clean handler
496
- logger.removeHandler(explain_handler)
497
- logger.setLevel(previous_logger_level)
498
- return CharsetMatches([results[encoding_iana]])
499
-
500
- if len(results) == 0:
501
- if fallback_u8 or fallback_ascii or fallback_specified:
502
- logger.log(
503
- TRACE,
504
- "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
505
- )
506
-
507
- if fallback_specified:
508
- logger.debug(
509
- "Encoding detection: %s will be used as a fallback match",
510
- fallback_specified.encoding,
511
- )
512
- results.append(fallback_specified)
513
- elif (
514
- (fallback_u8 and fallback_ascii is None)
515
- or (
516
- fallback_u8
517
- and fallback_ascii
518
- and fallback_u8.fingerprint != fallback_ascii.fingerprint
519
- )
520
- or (fallback_u8 is not None)
521
- ):
522
- logger.debug("Encoding detection: utf_8 will be used as a fallback match")
523
- results.append(fallback_u8)
524
- elif fallback_ascii:
525
- logger.debug("Encoding detection: ascii will be used as a fallback match")
526
- results.append(fallback_ascii)
527
-
528
- if results:
529
- logger.debug(
530
- "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
531
- results.best().encoding, # type: ignore
532
- len(results) - 1,
533
- )
534
- else:
535
- logger.debug("Encoding detection: Unable to determine any suitable charset.")
536
-
537
- if explain:
538
- logger.removeHandler(explain_handler)
539
- logger.setLevel(previous_logger_level)
540
-
541
- return results
542
-
543
-
544
- def from_fp(
545
- fp: BinaryIO,
546
- steps: int = 5,
547
- chunk_size: int = 512,
548
- threshold: float = 0.20,
549
- cp_isolation: list[str] | None = None,
550
- cp_exclusion: list[str] | None = None,
551
- preemptive_behaviour: bool = True,
552
- explain: bool = False,
553
- language_threshold: float = 0.1,
554
- enable_fallback: bool = True,
555
- ) -> CharsetMatches:
556
- """
557
- Same as the function from_bytes, but using a file pointer that is already ready.
558
- Will not close the file pointer.
559
- """
560
- return from_bytes(
561
- fp.read(),
562
- steps,
563
- chunk_size,
564
- threshold,
565
- cp_isolation,
566
- cp_exclusion,
567
- preemptive_behaviour,
568
- explain,
569
- language_threshold,
570
- enable_fallback,
571
- )
572
-
573
-
574
- def from_path(
575
- path: str | bytes | PathLike, # type: ignore[type-arg]
576
- steps: int = 5,
577
- chunk_size: int = 512,
578
- threshold: float = 0.20,
579
- cp_isolation: list[str] | None = None,
580
- cp_exclusion: list[str] | None = None,
581
- preemptive_behaviour: bool = True,
582
- explain: bool = False,
583
- language_threshold: float = 0.1,
584
- enable_fallback: bool = True,
585
- ) -> CharsetMatches:
586
- """
587
- Same as the function from_bytes, with one extra step: opening and reading the given file path in binary mode.
588
- Can raise IOError.
589
- """
590
- with open(path, "rb") as fp:
591
- return from_fp(
592
- fp,
593
- steps,
594
- chunk_size,
595
- threshold,
596
- cp_isolation,
597
- cp_exclusion,
598
- preemptive_behaviour,
599
- explain,
600
- language_threshold,
601
- enable_fallback,
602
- )
603
-
604
-
605
- def is_binary(
606
- fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
607
- steps: int = 5,
608
- chunk_size: int = 512,
609
- threshold: float = 0.20,
610
- cp_isolation: list[str] | None = None,
611
- cp_exclusion: list[str] | None = None,
612
- preemptive_behaviour: bool = True,
613
- explain: bool = False,
614
- language_threshold: float = 0.1,
615
- enable_fallback: bool = False,
616
- ) -> bool:
617
- """
618
- Detect if the given input (file, bytes, or path) points to a binary file, i.e. not text.
619
- Based on the same main heuristics and default kwargs, with the sole exception that fallback matches
620
- are disabled, to be stricter with content that is ASCII-compatible but unlikely to be text.
621
- """
622
- if isinstance(fp_or_path_or_payload, (str, PathLike)):
623
- guesses = from_path(
624
- fp_or_path_or_payload,
625
- steps=steps,
626
- chunk_size=chunk_size,
627
- threshold=threshold,
628
- cp_isolation=cp_isolation,
629
- cp_exclusion=cp_exclusion,
630
- preemptive_behaviour=preemptive_behaviour,
631
- explain=explain,
632
- language_threshold=language_threshold,
633
- enable_fallback=enable_fallback,
634
- )
635
- elif isinstance(
636
- fp_or_path_or_payload,
637
- (
638
- bytes,
639
- bytearray,
640
- ),
641
- ):
642
- guesses = from_bytes(
643
- fp_or_path_or_payload,
644
- steps=steps,
645
- chunk_size=chunk_size,
646
- threshold=threshold,
647
- cp_isolation=cp_isolation,
648
- cp_exclusion=cp_exclusion,
649
- preemptive_behaviour=preemptive_behaviour,
650
- explain=explain,
651
- language_threshold=language_threshold,
652
- enable_fallback=enable_fallback,
653
- )
654
- else:
655
- guesses = from_fp(
656
- fp_or_path_or_payload,
657
- steps=steps,
658
- chunk_size=chunk_size,
659
- threshold=threshold,
660
- cp_isolation=cp_isolation,
661
- cp_exclusion=cp_exclusion,
662
- preemptive_behaviour=preemptive_behaviour,
663
- explain=explain,
664
- language_threshold=language_threshold,
665
- enable_fallback=enable_fallback,
666
- )
667
-
668
- return not guesses
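Taken together, from_bytes does the heavy lifting while from_fp, from_path, and is_binary are thin wrappers over it. A hedged sketch of the wrappers in use; "sample.txt" is a placeholder path, not a file shipped with the package:

    from charset_normalizer import from_path, is_binary

    matches = from_path("sample.txt", steps=5, chunk_size=512, threshold=0.20)
    best = matches.best()
    if best is None:
        print("No plausible charset; the content is likely binary.")
    else:
        print(best.encoding, best.language)

    # is_binary accepts a path, raw bytes, or an open binary file object and
    # returns True when the detection above yields no match at all.
    print(is_binary(b"\x00\x01\x02\xff"))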
env/Lib/site-packages/charset_normalizer/cd.py DELETED
@@ -1,395 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import importlib
4
- from codecs import IncrementalDecoder
5
- from collections import Counter
6
- from functools import lru_cache
7
- from typing import Counter as TypeCounter
8
-
9
- from .constant import (
10
- FREQUENCIES,
11
- KO_NAMES,
12
- LANGUAGE_SUPPORTED_COUNT,
13
- TOO_SMALL_SEQUENCE,
14
- ZH_NAMES,
15
- )
16
- from .md import is_suspiciously_successive_range
17
- from .models import CoherenceMatches
18
- from .utils import (
19
- is_accentuated,
20
- is_latin,
21
- is_multi_byte_encoding,
22
- is_unicode_range_secondary,
23
- unicode_range,
24
- )
25
-
26
-
27
- def encoding_unicode_range(iana_name: str) -> list[str]:
28
- """
29
- Return the associated unicode ranges in a single-byte code page.
30
- """
31
- if is_multi_byte_encoding(iana_name):
32
- raise OSError("Function not supported on multi-byte code page")
33
-
34
- decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
35
-
36
- p: IncrementalDecoder = decoder(errors="ignore")
37
- seen_ranges: dict[str, int] = {}
38
- character_count: int = 0
39
-
40
- for i in range(0x40, 0xFF):
41
- chunk: str = p.decode(bytes([i]))
42
-
43
- if chunk:
44
- character_range: str | None = unicode_range(chunk)
45
-
46
- if character_range is None:
47
- continue
48
-
49
- if is_unicode_range_secondary(character_range) is False:
50
- if character_range not in seen_ranges:
51
- seen_ranges[character_range] = 0
52
- seen_ranges[character_range] += 1
53
- character_count += 1
54
-
55
- return sorted(
56
- [
57
- character_range
58
- for character_range in seen_ranges
59
- if seen_ranges[character_range] / character_count >= 0.15
60
- ]
61
- )
62
-
63
-
64
- def unicode_range_languages(primary_range: str) -> list[str]:
65
- """
66
- Return inferred languages used with a unicode range.
67
- """
68
- languages: list[str] = []
69
-
70
- for language, characters in FREQUENCIES.items():
71
- for character in characters:
72
- if unicode_range(character) == primary_range:
73
- languages.append(language)
74
- break
75
-
76
- return languages
77
-
78
-
79
- @lru_cache()
80
- def encoding_languages(iana_name: str) -> list[str]:
81
- """
82
- Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
83
- This function does the correspondence.
84
- """
85
- unicode_ranges: list[str] = encoding_unicode_range(iana_name)
86
- primary_range: str | None = None
87
-
88
- for specified_range in unicode_ranges:
89
- if "Latin" not in specified_range:
90
- primary_range = specified_range
91
- break
92
-
93
- if primary_range is None:
94
- return ["Latin Based"]
95
-
96
- return unicode_range_languages(primary_range)
97
-
98
-
99
- @lru_cache()
100
- def mb_encoding_languages(iana_name: str) -> list[str]:
101
- """
102
- Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
103
- This function does the correspondence.
104
- """
105
- if (
106
- iana_name.startswith("shift_")
107
- or iana_name.startswith("iso2022_jp")
108
- or iana_name.startswith("euc_j")
109
- or iana_name == "cp932"
110
- ):
111
- return ["Japanese"]
112
- if iana_name.startswith("gb") or iana_name in ZH_NAMES:
113
- return ["Chinese"]
114
- if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
115
- return ["Korean"]
116
-
117
- return []
118
-
119
-
120
- @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
121
- def get_target_features(language: str) -> tuple[bool, bool]:
122
- """
123
- Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
124
- """
125
- target_have_accents: bool = False
126
- target_pure_latin: bool = True
127
-
128
- for character in FREQUENCIES[language]:
129
- if not target_have_accents and is_accentuated(character):
130
- target_have_accents = True
131
- if target_pure_latin and is_latin(character) is False:
132
- target_pure_latin = False
133
-
134
- return target_have_accents, target_pure_latin
135
-
136
-
137
- def alphabet_languages(
138
- characters: list[str], ignore_non_latin: bool = False
139
- ) -> list[str]:
140
- """
141
- Return the languages associated with the given characters.
142
- """
143
- languages: list[tuple[str, float]] = []
144
-
145
- source_have_accents = any(is_accentuated(character) for character in characters)
146
-
147
- for language, language_characters in FREQUENCIES.items():
148
- target_have_accents, target_pure_latin = get_target_features(language)
149
-
150
- if ignore_non_latin and target_pure_latin is False:
151
- continue
152
-
153
- if target_have_accents is False and source_have_accents:
154
- continue
155
-
156
- character_count: int = len(language_characters)
157
-
158
- character_match_count: int = len(
159
- [c for c in language_characters if c in characters]
160
- )
161
-
162
- ratio: float = character_match_count / character_count
163
-
164
- if ratio >= 0.2:
165
- languages.append((language, ratio))
166
-
167
- languages = sorted(languages, key=lambda x: x[1], reverse=True)
168
-
169
- return [compatible_language[0] for compatible_language in languages]
170
-
171
-
172
- def characters_popularity_compare(
173
- language: str, ordered_characters: list[str]
174
- ) -> float:
175
- """
176
- Determine if an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
177
- The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
178
- Beware that this function is not strict on the match, in order to ease detection. (Meaning a close match counts as 1.)
179
- """
180
- if language not in FREQUENCIES:
181
- raise ValueError(f"{language} not available")
182
-
183
- character_approved_count: int = 0
184
- FREQUENCIES_language_set = set(FREQUENCIES[language])
185
-
186
- ordered_characters_count: int = len(ordered_characters)
187
- target_language_characters_count: int = len(FREQUENCIES[language])
188
-
189
- large_alphabet: bool = target_language_characters_count > 26
190
-
191
- for character, character_rank in zip(
192
- ordered_characters, range(0, ordered_characters_count)
193
- ):
194
- if character not in FREQUENCIES_language_set:
195
- continue
196
-
197
- character_rank_in_language: int = FREQUENCIES[language].index(character)
198
- expected_projection_ratio: float = (
199
- target_language_characters_count / ordered_characters_count
200
- )
201
- character_rank_projection: int = int(character_rank * expected_projection_ratio)
202
-
203
- if (
204
- large_alphabet is False
205
- and abs(character_rank_projection - character_rank_in_language) > 4
206
- ):
207
- continue
208
-
209
- if (
210
- large_alphabet is True
211
- and abs(character_rank_projection - character_rank_in_language)
212
- < target_language_characters_count / 3
213
- ):
214
- character_approved_count += 1
215
- continue
216
-
217
- characters_before_source: list[str] = FREQUENCIES[language][
218
- 0:character_rank_in_language
219
- ]
220
- characters_after_source: list[str] = FREQUENCIES[language][
221
- character_rank_in_language:
222
- ]
223
- characters_before: list[str] = ordered_characters[0:character_rank]
224
- characters_after: list[str] = ordered_characters[character_rank:]
225
-
226
- before_match_count: int = len(
227
- set(characters_before) & set(characters_before_source)
228
- )
229
-
230
- after_match_count: int = len(
231
- set(characters_after) & set(characters_after_source)
232
- )
233
-
234
- if len(characters_before_source) == 0 and before_match_count <= 4:
235
- character_approved_count += 1
236
- continue
237
-
238
- if len(characters_after_source) == 0 and after_match_count <= 4:
239
- character_approved_count += 1
240
- continue
241
-
242
- if (
243
- before_match_count / len(characters_before_source) >= 0.4
244
- or after_match_count / len(characters_after_source) >= 0.4
245
- ):
246
- character_approved_count += 1
247
- continue
248
-
249
- return character_approved_count / len(ordered_characters)
250
-
251
-
252
- def alpha_unicode_split(decoded_sequence: str) -> list[str]:
253
- """
254
- Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
255
- Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
256
- one containing the Latin letters and the other the Hebrew ones.
257
- """
258
- layers: dict[str, str] = {}
259
-
260
- for character in decoded_sequence:
261
- if character.isalpha() is False:
262
- continue
263
-
264
- character_range: str | None = unicode_range(character)
265
-
266
- if character_range is None:
267
- continue
268
-
269
- layer_target_range: str | None = None
270
-
271
- for discovered_range in layers:
272
- if (
273
- is_suspiciously_successive_range(discovered_range, character_range)
274
- is False
275
- ):
276
- layer_target_range = discovered_range
277
- break
278
-
279
- if layer_target_range is None:
280
- layer_target_range = character_range
281
-
282
- if layer_target_range not in layers:
283
- layers[layer_target_range] = character.lower()
284
- continue
285
-
286
- layers[layer_target_range] += character.lower()
287
-
288
- return list(layers.values())
289
-
290
-
291
- def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
292
- """
293
- This function merges results previously given by the function coherence_ratio.
294
- The return type is the same as coherence_ratio.
295
- """
296
- per_language_ratios: dict[str, list[float]] = {}
297
- for result in results:
298
- for sub_result in result:
299
- language, ratio = sub_result
300
- if language not in per_language_ratios:
301
- per_language_ratios[language] = [ratio]
302
- continue
303
- per_language_ratios[language].append(ratio)
304
-
305
- merge = [
306
- (
307
- language,
308
- round(
309
- sum(per_language_ratios[language]) / len(per_language_ratios[language]),
310
- 4,
311
- ),
312
- )
313
- for language in per_language_ratios
314
- ]
315
-
316
- return sorted(merge, key=lambda x: x[1], reverse=True)
317
-
318
-
319
- def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
320
- """
321
- We shall NOT return "English—" in CoherenceMatches because it is an alternative
322
- of "English". This function only keeps the best match and remove the em-dash in it.
323
- """
324
- index_results: dict[str, list[float]] = dict()
325
-
326
- for result in results:
327
- language, ratio = result
328
- no_em_name: str = language.replace("—", "")
329
-
330
- if no_em_name not in index_results:
331
- index_results[no_em_name] = []
332
-
333
- index_results[no_em_name].append(ratio)
334
-
335
- if any(len(index_results[e]) > 1 for e in index_results):
336
- filtered_results: CoherenceMatches = []
337
-
338
- for language in index_results:
339
- filtered_results.append((language, max(index_results[language])))
340
-
341
- return filtered_results
342
-
343
- return results
344
-
345
-
346
- @lru_cache(maxsize=2048)
347
- def coherence_ratio(
348
- decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
349
- ) -> CoherenceMatches:
350
- """
351
- Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
352
- A layer = Character extraction by alphabets/ranges.
353
- """
354
-
355
- results: list[tuple[str, float]] = []
356
- ignore_non_latin: bool = False
357
-
358
- sufficient_match_count: int = 0
359
-
360
- lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
361
- if "Latin Based" in lg_inclusion_list:
362
- ignore_non_latin = True
363
- lg_inclusion_list.remove("Latin Based")
364
-
365
- for layer in alpha_unicode_split(decoded_sequence):
366
- sequence_frequencies: TypeCounter[str] = Counter(layer)
367
- most_common = sequence_frequencies.most_common()
368
-
369
- character_count: int = sum(o for c, o in most_common)
370
-
371
- if character_count <= TOO_SMALL_SEQUENCE:
372
- continue
373
-
374
- popular_character_ordered: list[str] = [c for c, o in most_common]
375
-
376
- for language in lg_inclusion_list or alphabet_languages(
377
- popular_character_ordered, ignore_non_latin
378
- ):
379
- ratio: float = characters_popularity_compare(
380
- language, popular_character_ordered
381
- )
382
-
383
- if ratio < threshold:
384
- continue
385
- elif ratio >= 0.8:
386
- sufficient_match_count += 1
387
-
388
- results.append((language, round(ratio, 4)))
389
-
390
- if sufficient_match_count >= 3:
391
- break
392
-
393
- return sorted(
394
- filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
395
- )
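coherence_ratio ties this module together: the decoded text is split into per-alphabet layers by alpha_unicode_split, each layer's character-frequency order is compared against the FREQUENCIES tables, and the merged (language, ratio) pairs come back sorted. An illustrative call against this internal helper; the sample sentence and the expected languages are assumptions, and exact ratios depend on the frequency tables:

    from charset_normalizer.cd import coherence_ratio

    ratios = coherence_ratio("le texte est écrit en français, évidemment")
    # Expected shape: [("French", 0.66...), ("English", ...)], best match first.
    print(ratios)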
env/Lib/site-packages/charset_normalizer/cli/__init__.py DELETED
@@ -1,8 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from .__main__ import cli_detect, query_yes_no
4
-
5
- __all__ = (
6
- "cli_detect",
7
- "query_yes_no",
8
- )
env/Lib/site-packages/charset_normalizer/cli/__main__.py DELETED
@@ -1,321 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import sys
5
- from json import dumps
6
- from os.path import abspath, basename, dirname, join, realpath
7
- from platform import python_version
8
- from unicodedata import unidata_version
9
-
10
- import charset_normalizer.md as md_module
11
- from charset_normalizer import from_fp
12
- from charset_normalizer.models import CliDetectionResult
13
- from charset_normalizer.version import __version__
14
-
15
-
16
- def query_yes_no(question: str, default: str = "yes") -> bool:
17
- """Ask a yes/no question via input() and return their answer.
18
-
19
- "question" is a string that is presented to the user.
20
- "default" is the presumed answer if the user just hits <Enter>.
21
- It must be "yes" (the default), "no" or None (meaning
22
- an answer is required of the user).
23
-
24
- The "answer" return value is True for "yes" or False for "no".
25
-
26
- Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
27
- """
28
- valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
29
- if default is None:
30
- prompt = " [y/n] "
31
- elif default == "yes":
32
- prompt = " [Y/n] "
33
- elif default == "no":
34
- prompt = " [y/N] "
35
- else:
36
- raise ValueError("invalid default answer: '%s'" % default)
37
-
38
- while True:
39
- sys.stdout.write(question + prompt)
40
- choice = input().lower()
41
- if default is not None and choice == "":
42
- return valid[default]
43
- elif choice in valid:
44
- return valid[choice]
45
- else:
46
- sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
47
-
48
-
49
- def cli_detect(argv: list[str] | None = None) -> int:
50
- """
51
- CLI assistant using ARGV and ArgumentParser
52
- :param argv:
53
- :return: 0 if everything is fine, anything else signals trouble
54
- """
55
- parser = argparse.ArgumentParser(
56
- description="The Real First Universal Charset Detector. "
57
- "Discover originating encoding used on text file. "
58
- "Normalize text to unicode."
59
- )
60
-
61
- parser.add_argument(
62
- "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
63
- )
64
- parser.add_argument(
65
- "-v",
66
- "--verbose",
67
- action="store_true",
68
- default=False,
69
- dest="verbose",
70
- help="Display complementary information about file if any. "
71
- "Stdout will contain logs about the detection process.",
72
- )
73
- parser.add_argument(
74
- "-a",
75
- "--with-alternative",
76
- action="store_true",
77
- default=False,
78
- dest="alternatives",
79
- help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
80
- )
81
- parser.add_argument(
82
- "-n",
83
- "--normalize",
84
- action="store_true",
85
- default=False,
86
- dest="normalize",
87
- help="Permit to normalize input file. If not set, program does not write anything.",
88
- )
89
- parser.add_argument(
90
- "-m",
91
- "--minimal",
92
- action="store_true",
93
- default=False,
94
- dest="minimal",
95
- help="Only output the charset detected to STDOUT. Disabling JSON output.",
96
- )
97
- parser.add_argument(
98
- "-r",
99
- "--replace",
100
- action="store_true",
101
- default=False,
102
- dest="replace",
103
- help="Replace file when trying to normalize it instead of creating a new one.",
104
- )
105
- parser.add_argument(
106
- "-f",
107
- "--force",
108
- action="store_true",
109
- default=False,
110
- dest="force",
111
- help="Replace file without asking if you are sure, use this flag with caution.",
112
- )
113
- parser.add_argument(
114
- "-i",
115
- "--no-preemptive",
116
- action="store_true",
117
- default=False,
118
- dest="no_preemptive",
119
- help="Disable looking at a charset declaration to hint the detector.",
120
- )
121
- parser.add_argument(
122
- "-t",
123
- "--threshold",
124
- action="store",
125
- default=0.2,
126
- type=float,
127
- dest="threshold",
128
- help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
129
- )
130
- parser.add_argument(
131
- "--version",
132
- action="version",
133
- version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
134
- __version__,
135
- python_version(),
136
- unidata_version,
137
- "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
138
- ),
139
- help="Show version information and exit.",
140
- )
141
-
142
- args = parser.parse_args(argv)
143
-
144
- if args.replace is True and args.normalize is False:
145
- if args.files:
146
- for my_file in args.files:
147
- my_file.close()
148
- print("Use --replace in addition of --normalize only.", file=sys.stderr)
149
- return 1
150
-
151
- if args.force is True and args.replace is False:
152
- if args.files:
153
- for my_file in args.files:
154
- my_file.close()
155
- print("Use --force in addition of --replace only.", file=sys.stderr)
156
- return 1
157
-
158
- if args.threshold < 0.0 or args.threshold > 1.0:
159
- if args.files:
160
- for my_file in args.files:
161
- my_file.close()
162
- print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
163
- return 1
164
-
165
- x_ = []
166
-
167
- for my_file in args.files:
168
- matches = from_fp(
169
- my_file,
170
- threshold=args.threshold,
171
- explain=args.verbose,
172
- preemptive_behaviour=args.no_preemptive is False,
173
- )
174
-
175
- best_guess = matches.best()
176
-
177
- if best_guess is None:
178
- print(
179
- 'Unable to identify originating encoding for "{}". {}'.format(
180
- my_file.name,
181
- (
182
- "Maybe try increasing maximum amount of chaos."
183
- if args.threshold < 1.0
184
- else ""
185
- ),
186
- ),
187
- file=sys.stderr,
188
- )
189
- x_.append(
190
- CliDetectionResult(
191
- abspath(my_file.name),
192
- None,
193
- [],
194
- [],
195
- "Unknown",
196
- [],
197
- False,
198
- 1.0,
199
- 0.0,
200
- None,
201
- True,
202
- )
203
- )
204
- else:
205
- x_.append(
206
- CliDetectionResult(
207
- abspath(my_file.name),
208
- best_guess.encoding,
209
- best_guess.encoding_aliases,
210
- [
211
- cp
212
- for cp in best_guess.could_be_from_charset
213
- if cp != best_guess.encoding
214
- ],
215
- best_guess.language,
216
- best_guess.alphabets,
217
- best_guess.bom,
218
- best_guess.percent_chaos,
219
- best_guess.percent_coherence,
220
- None,
221
- True,
222
- )
223
- )
224
-
225
- if len(matches) > 1 and args.alternatives:
226
- for el in matches:
227
- if el != best_guess:
228
- x_.append(
229
- CliDetectionResult(
230
- abspath(my_file.name),
231
- el.encoding,
232
- el.encoding_aliases,
233
- [
234
- cp
235
- for cp in el.could_be_from_charset
236
- if cp != el.encoding
237
- ],
238
- el.language,
239
- el.alphabets,
240
- el.bom,
241
- el.percent_chaos,
242
- el.percent_coherence,
243
- None,
244
- False,
245
- )
246
- )
247
-
248
- if args.normalize is True:
249
- if best_guess.encoding.startswith("utf") is True:
250
- print(
251
- '"{}" file does not need to be normalized, as it already came from unicode.'.format(
252
- my_file.name
253
- ),
254
- file=sys.stderr,
255
- )
256
- if my_file.closed is False:
257
- my_file.close()
258
- continue
259
-
260
- dir_path = dirname(realpath(my_file.name))
261
- file_name = basename(realpath(my_file.name))
262
-
263
- o_: list[str] = file_name.split(".")
264
-
265
- if args.replace is False:
266
- o_.insert(-1, best_guess.encoding)
267
- if my_file.closed is False:
268
- my_file.close()
269
- elif (
270
- args.force is False
271
- and query_yes_no(
272
- 'Are you sure to normalize "{}" by replacing it ?'.format(
273
- my_file.name
274
- ),
275
- "no",
276
- )
277
- is False
278
- ):
279
- if my_file.closed is False:
280
- my_file.close()
281
- continue
282
-
283
- try:
284
- x_[0].unicode_path = join(dir_path, ".".join(o_))
285
-
286
- with open(x_[0].unicode_path, "wb") as fp:
287
- fp.write(best_guess.output())
288
- except OSError as e:
289
- print(str(e), file=sys.stderr)
290
- if my_file.closed is False:
291
- my_file.close()
292
- return 2
293
-
294
- if my_file.closed is False:
295
- my_file.close()
296
-
297
- if args.minimal is False:
298
- print(
299
- dumps(
300
- [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
301
- ensure_ascii=True,
302
- indent=4,
303
- )
304
- )
305
- else:
306
- for my_file in args.files:
307
- print(
308
- ", ".join(
309
- [
310
- el.encoding or "undefined"
311
- for el in x_
312
- if el.path == abspath(my_file.name)
313
- ]
314
- )
315
- )
316
-
317
- return 0
318
-
319
-
320
- if __name__ == "__main__":
321
- cli_detect()
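Because cli_detect takes an explicit argv list, the CLI above can be driven from Python as well as from the generated normalizer script. A small sketch; "unknown.txt" is a placeholder file name:

    from charset_normalizer.cli import cli_detect

    # Prints only the detected charset, thanks to -m/--minimal.
    exit_code = cli_detect(["--minimal", "unknown.txt"])

    # Per the docstring above: 0 means fine, anything else signals trouble.
    print(exit_code)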
env/Lib/site-packages/charset_normalizer/constant.py DELETED
@@ -1,1998 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
4
- from encodings.aliases import aliases
5
- from re import IGNORECASE
6
- from re import compile as re_compile
7
-
8
- # Contains, for each eligible encoding, the SIG/BOM bytes (a single item or a list)
9
- ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
10
- "utf_8": BOM_UTF8,
11
- "utf_7": [
12
- b"\x2b\x2f\x76\x38",
13
- b"\x2b\x2f\x76\x39",
14
- b"\x2b\x2f\x76\x2b",
15
- b"\x2b\x2f\x76\x2f",
16
- b"\x2b\x2f\x76\x38\x2d",
17
- ],
18
- "gb18030": b"\x84\x31\x95\x33",
19
- "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
20
- "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
21
- }
22
-
23
- TOO_SMALL_SEQUENCE: int = 32
24
- TOO_BIG_SEQUENCE: int = int(10e6)
25
-
26
- UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
27
-
28
- # Up-to-date Unicode ucd/15.0.0
29
- UNICODE_RANGES_COMBINED: dict[str, range] = {
30
- "Control character": range(32),
31
- "Basic Latin": range(32, 128),
32
- "Latin-1 Supplement": range(128, 256),
33
- "Latin Extended-A": range(256, 384),
34
- "Latin Extended-B": range(384, 592),
35
- "IPA Extensions": range(592, 688),
36
- "Spacing Modifier Letters": range(688, 768),
37
- "Combining Diacritical Marks": range(768, 880),
38
- "Greek and Coptic": range(880, 1024),
39
- "Cyrillic": range(1024, 1280),
40
- "Cyrillic Supplement": range(1280, 1328),
41
- "Armenian": range(1328, 1424),
42
- "Hebrew": range(1424, 1536),
43
- "Arabic": range(1536, 1792),
44
- "Syriac": range(1792, 1872),
45
- "Arabic Supplement": range(1872, 1920),
46
- "Thaana": range(1920, 1984),
47
- "NKo": range(1984, 2048),
48
- "Samaritan": range(2048, 2112),
49
- "Mandaic": range(2112, 2144),
50
- "Syriac Supplement": range(2144, 2160),
51
- "Arabic Extended-B": range(2160, 2208),
52
- "Arabic Extended-A": range(2208, 2304),
53
- "Devanagari": range(2304, 2432),
54
- "Bengali": range(2432, 2560),
55
- "Gurmukhi": range(2560, 2688),
56
- "Gujarati": range(2688, 2816),
57
- "Oriya": range(2816, 2944),
58
- "Tamil": range(2944, 3072),
59
- "Telugu": range(3072, 3200),
60
- "Kannada": range(3200, 3328),
61
- "Malayalam": range(3328, 3456),
62
- "Sinhala": range(3456, 3584),
63
- "Thai": range(3584, 3712),
64
- "Lao": range(3712, 3840),
65
- "Tibetan": range(3840, 4096),
66
- "Myanmar": range(4096, 4256),
67
- "Georgian": range(4256, 4352),
68
- "Hangul Jamo": range(4352, 4608),
69
- "Ethiopic": range(4608, 4992),
70
- "Ethiopic Supplement": range(4992, 5024),
71
- "Cherokee": range(5024, 5120),
72
- "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
73
- "Ogham": range(5760, 5792),
74
- "Runic": range(5792, 5888),
75
- "Tagalog": range(5888, 5920),
76
- "Hanunoo": range(5920, 5952),
77
- "Buhid": range(5952, 5984),
78
- "Tagbanwa": range(5984, 6016),
79
- "Khmer": range(6016, 6144),
80
- "Mongolian": range(6144, 6320),
81
- "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
82
- "Limbu": range(6400, 6480),
83
- "Tai Le": range(6480, 6528),
84
- "New Tai Lue": range(6528, 6624),
85
- "Khmer Symbols": range(6624, 6656),
86
- "Buginese": range(6656, 6688),
87
- "Tai Tham": range(6688, 6832),
88
- "Combining Diacritical Marks Extended": range(6832, 6912),
89
- "Balinese": range(6912, 7040),
90
- "Sundanese": range(7040, 7104),
91
- "Batak": range(7104, 7168),
92
- "Lepcha": range(7168, 7248),
93
- "Ol Chiki": range(7248, 7296),
94
- "Cyrillic Extended-C": range(7296, 7312),
95
- "Georgian Extended": range(7312, 7360),
96
- "Sundanese Supplement": range(7360, 7376),
97
- "Vedic Extensions": range(7376, 7424),
98
- "Phonetic Extensions": range(7424, 7552),
99
- "Phonetic Extensions Supplement": range(7552, 7616),
100
- "Combining Diacritical Marks Supplement": range(7616, 7680),
101
- "Latin Extended Additional": range(7680, 7936),
102
- "Greek Extended": range(7936, 8192),
103
- "General Punctuation": range(8192, 8304),
104
- "Superscripts and Subscripts": range(8304, 8352),
105
- "Currency Symbols": range(8352, 8400),
106
- "Combining Diacritical Marks for Symbols": range(8400, 8448),
107
- "Letterlike Symbols": range(8448, 8528),
108
- "Number Forms": range(8528, 8592),
109
- "Arrows": range(8592, 8704),
110
- "Mathematical Operators": range(8704, 8960),
111
- "Miscellaneous Technical": range(8960, 9216),
112
- "Control Pictures": range(9216, 9280),
113
- "Optical Character Recognition": range(9280, 9312),
114
- "Enclosed Alphanumerics": range(9312, 9472),
115
- "Box Drawing": range(9472, 9600),
116
- "Block Elements": range(9600, 9632),
117
- "Geometric Shapes": range(9632, 9728),
118
- "Miscellaneous Symbols": range(9728, 9984),
119
- "Dingbats": range(9984, 10176),
120
- "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
121
- "Supplemental Arrows-A": range(10224, 10240),
122
- "Braille Patterns": range(10240, 10496),
123
- "Supplemental Arrows-B": range(10496, 10624),
124
- "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
125
- "Supplemental Mathematical Operators": range(10752, 11008),
126
- "Miscellaneous Symbols and Arrows": range(11008, 11264),
127
- "Glagolitic": range(11264, 11360),
128
- "Latin Extended-C": range(11360, 11392),
129
- "Coptic": range(11392, 11520),
130
- "Georgian Supplement": range(11520, 11568),
131
- "Tifinagh": range(11568, 11648),
132
- "Ethiopic Extended": range(11648, 11744),
133
- "Cyrillic Extended-A": range(11744, 11776),
134
- "Supplemental Punctuation": range(11776, 11904),
135
- "CJK Radicals Supplement": range(11904, 12032),
136
- "Kangxi Radicals": range(12032, 12256),
137
- "Ideographic Description Characters": range(12272, 12288),
138
- "CJK Symbols and Punctuation": range(12288, 12352),
139
- "Hiragana": range(12352, 12448),
140
- "Katakana": range(12448, 12544),
141
- "Bopomofo": range(12544, 12592),
142
- "Hangul Compatibility Jamo": range(12592, 12688),
143
- "Kanbun": range(12688, 12704),
144
- "Bopomofo Extended": range(12704, 12736),
145
- "CJK Strokes": range(12736, 12784),
146
- "Katakana Phonetic Extensions": range(12784, 12800),
147
- "Enclosed CJK Letters and Months": range(12800, 13056),
148
- "CJK Compatibility": range(13056, 13312),
149
- "CJK Unified Ideographs Extension A": range(13312, 19904),
150
- "Yijing Hexagram Symbols": range(19904, 19968),
151
- "CJK Unified Ideographs": range(19968, 40960),
152
- "Yi Syllables": range(40960, 42128),
153
- "Yi Radicals": range(42128, 42192),
154
- "Lisu": range(42192, 42240),
155
- "Vai": range(42240, 42560),
156
- "Cyrillic Extended-B": range(42560, 42656),
157
- "Bamum": range(42656, 42752),
158
- "Modifier Tone Letters": range(42752, 42784),
159
- "Latin Extended-D": range(42784, 43008),
160
- "Syloti Nagri": range(43008, 43056),
161
- "Common Indic Number Forms": range(43056, 43072),
162
- "Phags-pa": range(43072, 43136),
163
- "Saurashtra": range(43136, 43232),
164
- "Devanagari Extended": range(43232, 43264),
165
- "Kayah Li": range(43264, 43312),
166
- "Rejang": range(43312, 43360),
167
- "Hangul Jamo Extended-A": range(43360, 43392),
168
- "Javanese": range(43392, 43488),
169
- "Myanmar Extended-B": range(43488, 43520),
170
- "Cham": range(43520, 43616),
171
- "Myanmar Extended-A": range(43616, 43648),
172
- "Tai Viet": range(43648, 43744),
173
- "Meetei Mayek Extensions": range(43744, 43776),
174
- "Ethiopic Extended-A": range(43776, 43824),
175
- "Latin Extended-E": range(43824, 43888),
176
- "Cherokee Supplement": range(43888, 43968),
177
- "Meetei Mayek": range(43968, 44032),
178
- "Hangul Syllables": range(44032, 55216),
179
- "Hangul Jamo Extended-B": range(55216, 55296),
180
- "High Surrogates": range(55296, 56192),
181
- "High Private Use Surrogates": range(56192, 56320),
182
- "Low Surrogates": range(56320, 57344),
183
- "Private Use Area": range(57344, 63744),
184
- "CJK Compatibility Ideographs": range(63744, 64256),
185
- "Alphabetic Presentation Forms": range(64256, 64336),
186
- "Arabic Presentation Forms-A": range(64336, 65024),
187
- "Variation Selectors": range(65024, 65040),
188
- "Vertical Forms": range(65040, 65056),
189
- "Combining Half Marks": range(65056, 65072),
190
- "CJK Compatibility Forms": range(65072, 65104),
191
- "Small Form Variants": range(65104, 65136),
192
- "Arabic Presentation Forms-B": range(65136, 65280),
193
- "Halfwidth and Fullwidth Forms": range(65280, 65520),
194
- "Specials": range(65520, 65536),
195
- "Linear B Syllabary": range(65536, 65664),
196
- "Linear B Ideograms": range(65664, 65792),
197
- "Aegean Numbers": range(65792, 65856),
198
- "Ancient Greek Numbers": range(65856, 65936),
199
- "Ancient Symbols": range(65936, 66000),
200
- "Phaistos Disc": range(66000, 66048),
201
- "Lycian": range(66176, 66208),
202
- "Carian": range(66208, 66272),
203
- "Coptic Epact Numbers": range(66272, 66304),
204
- "Old Italic": range(66304, 66352),
205
- "Gothic": range(66352, 66384),
206
- "Old Permic": range(66384, 66432),
207
- "Ugaritic": range(66432, 66464),
208
- "Old Persian": range(66464, 66528),
209
- "Deseret": range(66560, 66640),
210
- "Shavian": range(66640, 66688),
211
- "Osmanya": range(66688, 66736),
212
- "Osage": range(66736, 66816),
213
- "Elbasan": range(66816, 66864),
214
- "Caucasian Albanian": range(66864, 66928),
215
- "Vithkuqi": range(66928, 67008),
216
- "Linear A": range(67072, 67456),
217
- "Latin Extended-F": range(67456, 67520),
218
- "Cypriot Syllabary": range(67584, 67648),
219
- "Imperial Aramaic": range(67648, 67680),
220
- "Palmyrene": range(67680, 67712),
221
- "Nabataean": range(67712, 67760),
222
- "Hatran": range(67808, 67840),
223
- "Phoenician": range(67840, 67872),
224
- "Lydian": range(67872, 67904),
225
- "Meroitic Hieroglyphs": range(67968, 68000),
226
- "Meroitic Cursive": range(68000, 68096),
227
- "Kharoshthi": range(68096, 68192),
228
- "Old South Arabian": range(68192, 68224),
229
- "Old North Arabian": range(68224, 68256),
230
- "Manichaean": range(68288, 68352),
231
- "Avestan": range(68352, 68416),
232
- "Inscriptional Parthian": range(68416, 68448),
233
- "Inscriptional Pahlavi": range(68448, 68480),
234
- "Psalter Pahlavi": range(68480, 68528),
235
- "Old Turkic": range(68608, 68688),
236
- "Old Hungarian": range(68736, 68864),
237
- "Hanifi Rohingya": range(68864, 68928),
238
- "Rumi Numeral Symbols": range(69216, 69248),
239
- "Yezidi": range(69248, 69312),
240
- "Arabic Extended-C": range(69312, 69376),
241
- "Old Sogdian": range(69376, 69424),
242
- "Sogdian": range(69424, 69488),
243
- "Old Uyghur": range(69488, 69552),
244
- "Chorasmian": range(69552, 69600),
245
- "Elymaic": range(69600, 69632),
246
- "Brahmi": range(69632, 69760),
247
- "Kaithi": range(69760, 69840),
248
- "Sora Sompeng": range(69840, 69888),
249
- "Chakma": range(69888, 69968),
250
- "Mahajani": range(69968, 70016),
251
- "Sharada": range(70016, 70112),
252
- "Sinhala Archaic Numbers": range(70112, 70144),
253
- "Khojki": range(70144, 70224),
254
- "Multani": range(70272, 70320),
255
- "Khudawadi": range(70320, 70400),
256
- "Grantha": range(70400, 70528),
257
- "Newa": range(70656, 70784),
258
- "Tirhuta": range(70784, 70880),
259
- "Siddham": range(71040, 71168),
260
- "Modi": range(71168, 71264),
261
- "Mongolian Supplement": range(71264, 71296),
262
- "Takri": range(71296, 71376),
263
- "Ahom": range(71424, 71504),
264
- "Dogra": range(71680, 71760),
265
- "Warang Citi": range(71840, 71936),
266
- "Dives Akuru": range(71936, 72032),
267
- "Nandinagari": range(72096, 72192),
268
- "Zanabazar Square": range(72192, 72272),
269
- "Soyombo": range(72272, 72368),
270
- "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
271
- "Pau Cin Hau": range(72384, 72448),
272
- "Devanagari Extended-A": range(72448, 72544),
273
- "Bhaiksuki": range(72704, 72816),
274
- "Marchen": range(72816, 72896),
275
- "Masaram Gondi": range(72960, 73056),
276
- "Gunjala Gondi": range(73056, 73136),
277
- "Makasar": range(73440, 73472),
278
- "Kawi": range(73472, 73568),
279
- "Lisu Supplement": range(73648, 73664),
280
- "Tamil Supplement": range(73664, 73728),
281
- "Cuneiform": range(73728, 74752),
282
- "Cuneiform Numbers and Punctuation": range(74752, 74880),
283
- "Early Dynastic Cuneiform": range(74880, 75088),
284
- "Cypro-Minoan": range(77712, 77824),
285
- "Egyptian Hieroglyphs": range(77824, 78896),
286
- "Egyptian Hieroglyph Format Controls": range(78896, 78944),
287
- "Anatolian Hieroglyphs": range(82944, 83584),
288
- "Bamum Supplement": range(92160, 92736),
289
- "Mro": range(92736, 92784),
290
- "Tangsa": range(92784, 92880),
291
- "Bassa Vah": range(92880, 92928),
292
- "Pahawh Hmong": range(92928, 93072),
293
- "Medefaidrin": range(93760, 93856),
294
- "Miao": range(93952, 94112),
295
- "Ideographic Symbols and Punctuation": range(94176, 94208),
296
- "Tangut": range(94208, 100352),
297
- "Tangut Components": range(100352, 101120),
298
- "Khitan Small Script": range(101120, 101632),
299
- "Tangut Supplement": range(101632, 101760),
300
- "Kana Extended-B": range(110576, 110592),
301
- "Kana Supplement": range(110592, 110848),
302
- "Kana Extended-A": range(110848, 110896),
303
- "Small Kana Extension": range(110896, 110960),
304
- "Nushu": range(110960, 111360),
305
- "Duployan": range(113664, 113824),
306
- "Shorthand Format Controls": range(113824, 113840),
307
- "Znamenny Musical Notation": range(118528, 118736),
308
- "Byzantine Musical Symbols": range(118784, 119040),
309
- "Musical Symbols": range(119040, 119296),
310
- "Ancient Greek Musical Notation": range(119296, 119376),
311
- "Kaktovik Numerals": range(119488, 119520),
312
- "Mayan Numerals": range(119520, 119552),
313
- "Tai Xuan Jing Symbols": range(119552, 119648),
314
- "Counting Rod Numerals": range(119648, 119680),
315
- "Mathematical Alphanumeric Symbols": range(119808, 120832),
316
- "Sutton SignWriting": range(120832, 121520),
317
- "Latin Extended-G": range(122624, 122880),
318
- "Glagolitic Supplement": range(122880, 122928),
319
- "Cyrillic Extended-D": range(122928, 123024),
320
- "Nyiakeng Puachue Hmong": range(123136, 123216),
321
- "Toto": range(123536, 123584),
322
- "Wancho": range(123584, 123648),
323
- "Nag Mundari": range(124112, 124160),
324
- "Ethiopic Extended-B": range(124896, 124928),
325
- "Mende Kikakui": range(124928, 125152),
326
- "Adlam": range(125184, 125280),
327
- "Indic Siyaq Numbers": range(126064, 126144),
328
- "Ottoman Siyaq Numbers": range(126208, 126288),
329
- "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
330
- "Mahjong Tiles": range(126976, 127024),
331
- "Domino Tiles": range(127024, 127136),
332
- "Playing Cards": range(127136, 127232),
333
- "Enclosed Alphanumeric Supplement": range(127232, 127488),
334
- "Enclosed Ideographic Supplement": range(127488, 127744),
335
- "Miscellaneous Symbols and Pictographs": range(127744, 128512),
336
- "Emoticons range(Emoji)": range(128512, 128592),
337
- "Ornamental Dingbats": range(128592, 128640),
338
- "Transport and Map Symbols": range(128640, 128768),
339
- "Alchemical Symbols": range(128768, 128896),
340
- "Geometric Shapes Extended": range(128896, 129024),
341
- "Supplemental Arrows-C": range(129024, 129280),
342
- "Supplemental Symbols and Pictographs": range(129280, 129536),
343
- "Chess Symbols": range(129536, 129648),
344
- "Symbols and Pictographs Extended-A": range(129648, 129792),
345
- "Symbols for Legacy Computing": range(129792, 130048),
346
- "CJK Unified Ideographs Extension B": range(131072, 173792),
347
- "CJK Unified Ideographs Extension C": range(173824, 177984),
348
- "CJK Unified Ideographs Extension D": range(177984, 178208),
349
- "CJK Unified Ideographs Extension E": range(178208, 183984),
350
- "CJK Unified Ideographs Extension F": range(183984, 191472),
351
- "CJK Compatibility Ideographs Supplement": range(194560, 195104),
352
- "CJK Unified Ideographs Extension G": range(196608, 201552),
353
- "CJK Unified Ideographs Extension H": range(201552, 205744),
354
- "Tags": range(917504, 917632),
355
- "Variation Selectors Supplement": range(917760, 918000),
356
- "Supplementary Private Use Area-A": range(983040, 1048576),
357
- "Supplementary Private Use Area-B": range(1048576, 1114112),
358
- }
359
-
360
-
361
- UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
362
- "Supplement",
363
- "Extended",
364
- "Extensions",
365
- "Modifier",
366
- "Marks",
367
- "Punctuation",
368
- "Symbols",
369
- "Forms",
370
- "Operators",
371
- "Miscellaneous",
372
- "Drawing",
373
- "Block",
374
- "Shapes",
375
- "Supplemental",
376
- "Tags",
377
- ]
378
-
379
- RE_POSSIBLE_ENCODING_INDICATION = re_compile(
380
- r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
381
- IGNORECASE,
382
- )
383
-
384
- IANA_NO_ALIASES = [
385
- "cp720",
386
- "cp737",
387
- "cp856",
388
- "cp874",
389
- "cp875",
390
- "cp1006",
391
- "koi8_r",
392
- "koi8_t",
393
- "koi8_u",
394
- ]
395
-
396
- IANA_SUPPORTED: list[str] = sorted(
397
- filter(
398
- lambda x: x.endswith("_codec") is False
399
- and x not in {"rot_13", "tactis", "mbcs"},
400
- list(set(aliases.values())) + IANA_NO_ALIASES,
401
- )
402
- )
403
-
404
- IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
405
-
406
- # pre-computed code page that are similar using the function cp_similarity.
407
- IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
408
- "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
409
- "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
410
- "cp1125": ["cp866"],
411
- "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
412
- "cp1250": ["iso8859_2"],
413
- "cp1251": ["kz1048", "ptcp154"],
414
- "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
415
- "cp1253": ["iso8859_7"],
416
- "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
417
- "cp1257": ["iso8859_13"],
418
- "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
419
- "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
420
- "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
421
- "cp850": ["cp437", "cp857", "cp858", "cp865"],
422
- "cp857": ["cp850", "cp858", "cp865"],
423
- "cp858": ["cp437", "cp850", "cp857", "cp865"],
424
- "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
425
- "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
426
- "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
427
- "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
428
- "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
429
- "cp866": ["cp1125"],
430
- "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
431
- "iso8859_11": ["tis_620"],
432
- "iso8859_13": ["cp1257"],
433
- "iso8859_14": [
434
- "iso8859_10",
435
- "iso8859_15",
436
- "iso8859_16",
437
- "iso8859_3",
438
- "iso8859_9",
439
- "latin_1",
440
- ],
441
- "iso8859_15": [
442
- "cp1252",
443
- "cp1254",
444
- "iso8859_10",
445
- "iso8859_14",
446
- "iso8859_16",
447
- "iso8859_3",
448
- "iso8859_9",
449
- "latin_1",
450
- ],
451
- "iso8859_16": [
452
- "iso8859_14",
453
- "iso8859_15",
454
- "iso8859_2",
455
- "iso8859_3",
456
- "iso8859_9",
457
- "latin_1",
458
- ],
459
- "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
460
- "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
461
- "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
462
- "iso8859_7": ["cp1253"],
463
- "iso8859_9": [
464
- "cp1252",
465
- "cp1254",
466
- "cp1258",
467
- "iso8859_10",
468
- "iso8859_14",
469
- "iso8859_15",
470
- "iso8859_16",
471
- "iso8859_3",
472
- "iso8859_4",
473
- "latin_1",
474
- ],
475
- "kz1048": ["cp1251", "ptcp154"],
476
- "latin_1": [
477
- "cp1252",
478
- "cp1254",
479
- "cp1258",
480
- "iso8859_10",
481
- "iso8859_14",
482
- "iso8859_15",
483
- "iso8859_16",
484
- "iso8859_3",
485
- "iso8859_4",
486
- "iso8859_9",
487
- ],
488
- "mac_iceland": ["mac_roman", "mac_turkish"],
489
- "mac_roman": ["mac_iceland", "mac_turkish"],
490
- "mac_turkish": ["mac_iceland", "mac_roman"],
491
- "ptcp154": ["cp1251", "kz1048"],
492
- "tis_620": ["iso8859_11"],
493
- }
494
-
495
-
496
- CHARDET_CORRESPONDENCE: dict[str, str] = {
497
- "iso2022_kr": "ISO-2022-KR",
498
- "iso2022_jp": "ISO-2022-JP",
499
- "euc_kr": "EUC-KR",
500
- "tis_620": "TIS-620",
501
- "utf_32": "UTF-32",
502
- "euc_jp": "EUC-JP",
503
- "koi8_r": "KOI8-R",
504
- "iso8859_1": "ISO-8859-1",
505
- "iso8859_2": "ISO-8859-2",
506
- "iso8859_5": "ISO-8859-5",
507
- "iso8859_6": "ISO-8859-6",
508
- "iso8859_7": "ISO-8859-7",
509
- "iso8859_8": "ISO-8859-8",
510
- "utf_16": "UTF-16",
511
- "cp855": "IBM855",
512
- "mac_cyrillic": "MacCyrillic",
513
- "gb2312": "GB2312",
514
- "gb18030": "GB18030",
515
- "cp932": "CP932",
516
- "cp866": "IBM866",
517
- "utf_8": "utf-8",
518
- "utf_8_sig": "UTF-8-SIG",
519
- "shift_jis": "SHIFT_JIS",
520
- "big5": "Big5",
521
- "cp1250": "windows-1250",
522
- "cp1251": "windows-1251",
523
- "cp1252": "Windows-1252",
524
- "cp1253": "windows-1253",
525
- "cp1255": "windows-1255",
526
- "cp1256": "windows-1256",
527
- "cp1254": "Windows-1254",
528
- "cp949": "CP949",
529
- }
530
-
531
-
532
- COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
533
- "<",
534
- ">",
535
- "=",
536
- ":",
537
- "/",
538
- "&",
539
- ";",
540
- "{",
541
- "}",
542
- "[",
543
- "]",
544
- ",",
545
- "|",
546
- '"',
547
- "-",
548
- "(",
549
- ")",
550
- }
551
-
552
-
553
- KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
554
- ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
555
-
556
- # Logging LEVEL below DEBUG
557
- TRACE: int = 5
558
-
559
-
560
- # Language label that contain the em dash "—"
561
- # character are to be considered alternative seq to origin
562
- FREQUENCIES: dict[str, list[str]] = {
563
- "English": [
564
- "e",
565
- "a",
566
- "t",
567
- "i",
568
- "o",
569
- "n",
570
- "s",
571
- "r",
572
- "h",
573
- "l",
574
- "d",
575
- "c",
576
- "u",
577
- "m",
578
- "f",
579
- "p",
580
- "g",
581
- "w",
582
- "y",
583
- "b",
584
- "v",
585
- "k",
586
- "x",
587
- "j",
588
- "z",
589
- "q",
590
- ],
591
- "English—": [
592
- "e",
593
- "a",
594
- "t",
595
- "i",
596
- "o",
597
- "n",
598
- "s",
599
- "r",
600
- "h",
601
- "l",
602
- "d",
603
- "c",
604
- "m",
605
- "u",
606
- "f",
607
- "p",
608
- "g",
609
- "w",
610
- "b",
611
- "y",
612
- "v",
613
- "k",
614
- "j",
615
- "x",
616
- "z",
617
- "q",
618
- ],
619
- "German": [
620
- "e",
621
- "n",
622
- "i",
623
- "r",
624
- "s",
625
- "t",
626
- "a",
627
- "d",
628
- "h",
629
- "u",
630
- "l",
631
- "g",
632
- "o",
633
- "c",
634
- "m",
635
- "b",
636
- "f",
637
- "k",
638
- "w",
639
- "z",
640
- "p",
641
- "v",
642
- "ü",
643
- "ä",
644
- "ö",
645
- "j",
646
- ],
647
- "French": [
648
- "e",
649
- "a",
650
- "s",
651
- "n",
652
- "i",
653
- "t",
654
- "r",
655
- "l",
656
- "u",
657
- "o",
658
- "d",
659
- "c",
660
- "p",
661
- "m",
662
- "é",
663
- "v",
664
- "g",
665
- "f",
666
- "b",
667
- "h",
668
- "q",
669
- "à",
670
- "x",
671
- "è",
672
- "y",
673
- "j",
674
- ],
675
- "Dutch": [
676
- "e",
677
- "n",
678
- "a",
679
- "i",
680
- "r",
681
- "t",
682
- "o",
683
- "d",
684
- "s",
685
- "l",
686
- "g",
687
- "h",
688
- "v",
689
- "m",
690
- "u",
691
- "k",
692
- "c",
693
- "p",
694
- "b",
695
- "w",
696
- "j",
697
- "z",
698
- "f",
699
- "y",
700
- "x",
701
- "ë",
702
- ],
703
- "Italian": [
704
- "e",
705
- "i",
706
- "a",
707
- "o",
708
- "n",
709
- "l",
710
- "t",
711
- "r",
712
- "s",
713
- "c",
714
- "d",
715
- "u",
716
- "p",
717
- "m",
718
- "g",
719
- "v",
720
- "f",
721
- "b",
722
- "z",
723
- "h",
724
- "q",
725
- "è",
726
- "à",
727
- "k",
728
- "y",
729
- "ò",
730
- ],
731
- "Polish": [
732
- "a",
733
- "i",
734
- "o",
735
- "e",
736
- "n",
737
- "r",
738
- "z",
739
- "w",
740
- "s",
741
- "c",
742
- "t",
743
- "k",
744
- "y",
745
- "d",
746
- "p",
747
- "m",
748
- "u",
749
- "l",
750
- "j",
751
- "ł",
752
- "g",
753
- "b",
754
- "h",
755
- "ą",
756
- "ę",
757
- "ó",
758
- ],
759
- "Spanish": [
760
- "e",
761
- "a",
762
- "o",
763
- "n",
764
- "s",
765
- "r",
766
- "i",
767
- "l",
768
- "d",
769
- "t",
770
- "c",
771
- "u",
772
- "m",
773
- "p",
774
- "b",
775
- "g",
776
- "v",
777
- "f",
778
- "y",
779
- "ó",
780
- "h",
781
- "q",
782
- "í",
783
- "j",
784
- "z",
785
- "á",
786
- ],
787
- "Russian": [
788
- "о",
789
- "а",
790
- "е",
791
- "и",
792
- "н",
793
- "с",
794
- "т",
795
- "р",
796
- "в",
797
- "л",
798
- "к",
799
- "м",
800
- "д",
801
- "п",
802
- "у",
803
- "г",
804
- "я",
805
- "ы",
806
- "з",
807
- "б",
808
- "й",
809
- "ь",
810
- "ч",
811
- "х",
812
- "ж",
813
- "ц",
814
- ],
815
- # Jap-Kanji
816
- "Japanese": [
817
- "人",
818
- "一",
819
- "大",
820
- "亅",
821
- "丁",
822
- "丨",
823
- "竹",
824
- "笑",
825
- "口",
826
- "日",
827
- "今",
828
- "二",
829
- "彳",
830
- "行",
831
- "十",
832
- "土",
833
- "丶",
834
- "寸",
835
- "寺",
836
- "時",
837
- "乙",
838
- "丿",
839
- "乂",
840
- "气",
841
- "気",
842
- "冂",
843
- "巾",
844
- "亠",
845
- "市",
846
- "目",
847
- "儿",
848
- "見",
849
- "八",
850
- "小",
851
- "凵",
852
- "県",
853
- "月",
854
- "彐",
855
- "門",
856
- "間",
857
- "木",
858
- "東",
859
- "山",
860
- "出",
861
- "本",
862
- "中",
863
- "刀",
864
- "分",
865
- "耳",
866
- "又",
867
- "取",
868
- "最",
869
- "言",
870
- "田",
871
- "心",
872
- "思",
873
- "刂",
874
- "前",
875
- "京",
876
- "尹",
877
- "事",
878
- "生",
879
- "厶",
880
- "云",
881
- "会",
882
- "未",
883
- "来",
884
- "白",
885
- "冫",
886
- "楽",
887
- "灬",
888
- "馬",
889
- "尸",
890
- "尺",
891
- "駅",
892
- "明",
893
- "耂",
894
- "者",
895
- "了",
896
- "阝",
897
- "都",
898
- "高",
899
- "卜",
900
- "占",
901
- "厂",
902
- "广",
903
- "店",
904
- "子",
905
- "申",
906
- "奄",
907
- "亻",
908
- "俺",
909
- "上",
910
- "方",
911
- "冖",
912
- "学",
913
- "衣",
914
- "艮",
915
- "食",
916
- "自",
917
- ],
918
- # Jap-Katakana
919
- "Japanese—": [
920
- "ー",
921
- "ン",
922
- "ス",
923
- "・",
924
- "ル",
925
- "ト",
926
- "リ",
927
- "イ",
928
- "ア",
929
- "ラ",
930
- "ッ",
931
- "ク",
932
- "ド",
933
- "シ",
934
- "レ",
935
- "ジ",
936
- "タ",
937
- "フ",
938
- "ロ",
939
- "カ",
940
- "テ",
941
- "マ",
942
- "ィ",
943
- "グ",
944
- "バ",
945
- "ム",
946
- "プ",
947
- "オ",
948
- "コ",
949
- "デ",
950
- "ニ",
951
- "ウ",
952
- "メ",
953
- "サ",
954
- "ビ",
955
- "ナ",
956
- "ブ",
957
- "ャ",
958
- "エ",
959
- "ュ",
960
- "チ",
961
- "キ",
962
- "ズ",
963
- "ダ",
964
- "パ",
965
- "ミ",
966
- "ェ",
967
- "ョ",
968
- "ハ",
969
- "セ",
970
- "ベ",
971
- "ガ",
972
- "モ",
973
- "ツ",
974
- "ネ",
975
- "ボ",
976
- "ソ",
977
- "ノ",
978
- "ァ",
979
- "ヴ",
980
- "ワ",
981
- "ポ",
982
- "ペ",
983
- "ピ",
984
- "ケ",
985
- "ゴ",
986
- "ギ",
987
- "ザ",
988
- "ホ",
989
- "ゲ",
990
- "ォ",
991
- "ヤ",
992
- "ヒ",
993
- "ユ",
994
- "ヨ",
995
- "ヘ",
996
- "ゼ",
997
- "ヌ",
998
- "ゥ",
999
- "ゾ",
1000
- "ヶ",
1001
- "ヂ",
1002
- "ヲ",
1003
- "ヅ",
1004
- "ヵ",
1005
- "ヱ",
1006
- "ヰ",
1007
- "ヮ",
1008
- "ヽ",
1009
- "゠",
1010
- "ヾ",
1011
- "ヷ",
1012
- "ヿ",
1013
- "ヸ",
1014
- "ヹ",
1015
- "ヺ",
1016
- ],
1017
- # Jap-Hiragana
1018
- "Japanese——": [
1019
- "の",
1020
- "に",
1021
- "る",
1022
- "た",
1023
- "と",
1024
- "は",
1025
- "し",
1026
- "い",
1027
- "を",
1028
- "で",
1029
- "て",
1030
- "が",
1031
- "な",
1032
- "れ",
1033
- "か",
1034
- "ら",
1035
- "さ",
1036
- "っ",
1037
- "り",
1038
- "す",
1039
- "あ",
1040
- "も",
1041
- "こ",
1042
- "ま",
1043
- "う",
1044
- "く",
1045
- "よ",
1046
- "き",
1047
- "ん",
1048
- "め",
1049
- "お",
1050
- "け",
1051
- "そ",
1052
- "つ",
1053
- "だ",
1054
- "や",
1055
- "え",
1056
- "ど",
1057
- "わ",
1058
- "ち",
1059
- "み",
1060
- "せ",
1061
- "じ",
1062
- "ば",
1063
- "へ",
1064
- "び",
1065
- "ず",
1066
- "ろ",
1067
- "ほ",
1068
- "げ",
1069
- "む",
1070
- "べ",
1071
- "ひ",
1072
- "ょ",
1073
- "ゆ",
1074
- "ぶ",
1075
- "ご",
1076
- "ゃ",
1077
- "ね",
1078
- "ふ",
1079
- "ぐ",
1080
- "ぎ",
1081
- "ぼ",
1082
- "ゅ",
1083
- "づ",
1084
- "ざ",
1085
- "ぞ",
1086
- "ぬ",
1087
- "ぜ",
1088
- "ぱ",
1089
- "ぽ",
1090
- "ぷ",
1091
- "ぴ",
1092
- "ぃ",
1093
- "ぁ",
1094
- "ぇ",
1095
- "ぺ",
1096
- "ゞ",
1097
- "ぢ",
1098
- "ぉ",
1099
- "ぅ",
1100
- "ゐ",
1101
- "ゝ",
1102
- "ゑ",
1103
- "゛",
1104
- "゜",
1105
- "ゎ",
1106
- "ゔ",
1107
- "゚",
1108
- "ゟ",
1109
- "゙",
1110
- "ゕ",
1111
- "ゖ",
1112
- ],
1113
- "Portuguese": [
1114
- "a",
1115
- "e",
1116
- "o",
1117
- "s",
1118
- "i",
1119
- "r",
1120
- "d",
1121
- "n",
1122
- "t",
1123
- "m",
1124
- "u",
1125
- "c",
1126
- "l",
1127
- "p",
1128
- "g",
1129
- "v",
1130
- "b",
1131
- "f",
1132
- "h",
1133
- "ã",
1134
- "q",
1135
- "é",
1136
- "ç",
1137
- "á",
1138
- "z",
1139
- "í",
1140
- ],
1141
- "Swedish": [
1142
- "e",
1143
- "a",
1144
- "n",
1145
- "r",
1146
- "t",
1147
- "s",
1148
- "i",
1149
- "l",
1150
- "d",
1151
- "o",
1152
- "m",
1153
- "k",
1154
- "g",
1155
- "v",
1156
- "h",
1157
- "f",
1158
- "u",
1159
- "p",
1160
- "ä",
1161
- "c",
1162
- "b",
1163
- "ö",
1164
- "å",
1165
- "y",
1166
- "j",
1167
- "x",
1168
- ],
1169
- "Chinese": [
1170
- "的",
1171
- "一",
1172
- "是",
1173
- "不",
1174
- "了",
1175
- "在",
1176
- "人",
1177
- "有",
1178
- "我",
1179
- "他",
1180
- "这",
1181
- "个",
1182
- "们",
1183
- "中",
1184
- "来",
1185
- "上",
1186
- "大",
1187
- "为",
1188
- "和",
1189
- "国",
1190
- "地",
1191
- "到",
1192
- "以",
1193
- "说",
1194
- "时",
1195
- "要",
1196
- "就",
1197
- "出",
1198
- "会",
1199
- "可",
1200
- "也",
1201
- "你",
1202
- "对",
1203
- "生",
1204
- "能",
1205
- "而",
1206
- "子",
1207
- "那",
1208
- "得",
1209
- "于",
1210
- "着",
1211
- "下",
1212
- "自",
1213
- "之",
1214
- "年",
1215
- "过",
1216
- "发",
1217
- "后",
1218
- "作",
1219
- "里",
1220
- "用",
1221
- "道",
1222
- "行",
1223
- "所",
1224
- "然",
1225
- "家",
1226
- "种",
1227
- "事",
1228
- "成",
1229
- "方",
1230
- "多",
1231
- "经",
1232
- "么",
1233
- "去",
1234
- "法",
1235
- "学",
1236
- "如",
1237
- "都",
1238
- "同",
1239
- "现",
1240
- "当",
1241
- "没",
1242
- "动",
1243
- "面",
1244
- "起",
1245
- "看",
1246
- "定",
1247
- "天",
1248
- "分",
1249
- "还",
1250
- "进",
1251
- "好",
1252
- "小",
1253
- "部",
1254
- "其",
1255
- "些",
1256
- "主",
1257
- "样",
1258
- "理",
1259
- "心",
1260
- "她",
1261
- "本",
1262
- "前",
1263
- "开",
1264
- "但",
1265
- "因",
1266
- "只",
1267
- "从",
1268
- "想",
1269
- "实",
1270
- ],
1271
- "Ukrainian": [
1272
- "о",
1273
- "а",
1274
- "н",
1275
- "і",
1276
- "и",
1277
- "р",
1278
- "в",
1279
- "т",
1280
- "е",
1281
- "с",
1282
- "к",
1283
- "л",
1284
- "у",
1285
- "д",
1286
- "м",
1287
- "п",
1288
- "з",
1289
- "я",
1290
- "ь",
1291
- "б",
1292
- "г",
1293
- "й",
1294
- "ч",
1295
- "х",
1296
- "ц",
1297
- "ї",
1298
- ],
1299
- "Norwegian": [
1300
- "e",
1301
- "r",
1302
- "n",
1303
- "t",
1304
- "a",
1305
- "s",
1306
- "i",
1307
- "o",
1308
- "l",
1309
- "d",
1310
- "g",
1311
- "k",
1312
- "m",
1313
- "v",
1314
- "f",
1315
- "p",
1316
- "u",
1317
- "b",
1318
- "h",
1319
- "å",
1320
- "y",
1321
- "j",
1322
- "ø",
1323
- "c",
1324
- "æ",
1325
- "w",
1326
- ],
1327
- "Finnish": [
1328
- "a",
1329
- "i",
1330
- "n",
1331
- "t",
1332
- "e",
1333
- "s",
1334
- "l",
1335
- "o",
1336
- "u",
1337
- "k",
1338
- "ä",
1339
- "m",
1340
- "r",
1341
- "v",
1342
- "j",
1343
- "h",
1344
- "p",
1345
- "y",
1346
- "d",
1347
- "ö",
1348
- "g",
1349
- "c",
1350
- "b",
1351
- "f",
1352
- "w",
1353
- "z",
1354
- ],
1355
- "Vietnamese": [
1356
- "n",
1357
- "h",
1358
- "t",
1359
- "i",
1360
- "c",
1361
- "g",
1362
- "a",
1363
- "o",
1364
- "u",
1365
- "m",
1366
- "l",
1367
- "r",
1368
- "à",
1369
- "đ",
1370
- "s",
1371
- "e",
1372
- "v",
1373
- "p",
1374
- "b",
1375
- "y",
1376
- "ư",
1377
- "d",
1378
- "á",
1379
- "k",
1380
- "ộ",
1381
- "ế",
1382
- ],
1383
- "Czech": [
1384
- "o",
1385
- "e",
1386
- "a",
1387
- "n",
1388
- "t",
1389
- "s",
1390
- "i",
1391
- "l",
1392
- "v",
1393
- "r",
1394
- "k",
1395
- "d",
1396
- "u",
1397
- "m",
1398
- "p",
1399
- "í",
1400
- "c",
1401
- "h",
1402
- "z",
1403
- "á",
1404
- "y",
1405
- "j",
1406
- "b",
1407
- "ě",
1408
- "é",
1409
- "ř",
1410
- ],
1411
- "Hungarian": [
1412
- "e",
1413
- "a",
1414
- "t",
1415
- "l",
1416
- "s",
1417
- "n",
1418
- "k",
1419
- "r",
1420
- "i",
1421
- "o",
1422
- "z",
1423
- "á",
1424
- "é",
1425
- "g",
1426
- "m",
1427
- "b",
1428
- "y",
1429
- "v",
1430
- "d",
1431
- "h",
1432
- "u",
1433
- "p",
1434
- "j",
1435
- "ö",
1436
- "f",
1437
- "c",
1438
- ],
1439
- "Korean": [
1440
- "이",
1441
- "다",
1442
- "에",
1443
- "의",
1444
- "는",
1445
- "로",
1446
- "하",
1447
- "을",
1448
- "가",
1449
- "고",
1450
- "지",
1451
- "서",
1452
- "한",
1453
- "은",
1454
- "기",
1455
- "으",
1456
- "년",
1457
- "대",
1458
- "사",
1459
- "시",
1460
- "를",
1461
- "리",
1462
- "도",
1463
- "인",
1464
- "스",
1465
- "일",
1466
- ],
1467
- "Indonesian": [
1468
- "a",
1469
- "n",
1470
- "e",
1471
- "i",
1472
- "r",
1473
- "t",
1474
- "u",
1475
- "s",
1476
- "d",
1477
- "k",
1478
- "m",
1479
- "l",
1480
- "g",
1481
- "p",
1482
- "b",
1483
- "o",
1484
- "h",
1485
- "y",
1486
- "j",
1487
- "c",
1488
- "w",
1489
- "f",
1490
- "v",
1491
- "z",
1492
- "x",
1493
- "q",
1494
- ],
1495
- "Turkish": [
1496
- "a",
1497
- "e",
1498
- "i",
1499
- "n",
1500
- "r",
1501
- "l",
1502
- "ı",
1503
- "k",
1504
- "d",
1505
- "t",
1506
- "s",
1507
- "m",
1508
- "y",
1509
- "u",
1510
- "o",
1511
- "b",
1512
- "ü",
1513
- "ş",
1514
- "v",
1515
- "g",
1516
- "z",
1517
- "h",
1518
- "c",
1519
- "p",
1520
- "ç",
1521
- "ğ",
1522
- ],
1523
- "Romanian": [
1524
- "e",
1525
- "i",
1526
- "a",
1527
- "r",
1528
- "n",
1529
- "t",
1530
- "u",
1531
- "l",
1532
- "o",
1533
- "c",
1534
- "s",
1535
- "d",
1536
- "p",
1537
- "m",
1538
- "ă",
1539
- "f",
1540
- "v",
1541
- "î",
1542
- "g",
1543
- "b",
1544
- "ș",
1545
- "ț",
1546
- "z",
1547
- "h",
1548
- "â",
1549
- "j",
1550
- ],
1551
- "Farsi": [
1552
- "ا",
1553
- "ی",
1554
- "ر",
1555
- "د",
1556
- "ن",
1557
- "ه",
1558
- "و",
1559
- "م",
1560
- "ت",
1561
- "ب",
1562
- "س",
1563
- "ل",
1564
- "ک",
1565
- "ش",
1566
- "ز",
1567
- "ف",
1568
- "گ",
1569
- "ع",
1570
- "خ",
1571
- "ق",
1572
- "ج",
1573
- "آ",
1574
- "پ",
1575
- "ح",
1576
- "ط",
1577
- "ص",
1578
- ],
1579
- "Arabic": [
1580
- "ا",
1581
- "ل",
1582
- "ي",
1583
- "م",
1584
- "و",
1585
- "ن",
1586
- "ر",
1587
- "ت",
1588
- "ب",
1589
- "ة",
1590
- "ع",
1591
- "د",
1592
- "س",
1593
- "ف",
1594
- "ه",
1595
- "ك",
1596
- "ق",
1597
- "أ",
1598
- "ح",
1599
- "ج",
1600
- "ش",
1601
- "ط",
1602
- "ص",
1603
- "ى",
1604
- "خ",
1605
- "إ",
1606
- ],
1607
- "Danish": [
1608
- "e",
1609
- "r",
1610
- "n",
1611
- "t",
1612
- "a",
1613
- "i",
1614
- "s",
1615
- "d",
1616
- "l",
1617
- "o",
1618
- "g",
1619
- "m",
1620
- "k",
1621
- "f",
1622
- "v",
1623
- "u",
1624
- "b",
1625
- "h",
1626
- "p",
1627
- "å",
1628
- "y",
1629
- "ø",
1630
- "æ",
1631
- "c",
1632
- "j",
1633
- "w",
1634
- ],
1635
- "Serbian": [
1636
- "а",
1637
- "и",
1638
- "о",
1639
- "е",
1640
- "н",
1641
- "р",
1642
- "с",
1643
- "у",
1644
- "т",
1645
- "к",
1646
- "ј",
1647
- "в",
1648
- "д",
1649
- "м",
1650
- "п",
1651
- "л",
1652
- "г",
1653
- "з",
1654
- "б",
1655
- "a",
1656
- "i",
1657
- "e",
1658
- "o",
1659
- "n",
1660
- "ц",
1661
- "ш",
1662
- ],
1663
- "Lithuanian": [
1664
- "i",
1665
- "a",
1666
- "s",
1667
- "o",
1668
- "r",
1669
- "e",
1670
- "t",
1671
- "n",
1672
- "u",
1673
- "k",
1674
- "m",
1675
- "l",
1676
- "p",
1677
- "v",
1678
- "d",
1679
- "j",
1680
- "g",
1681
- "ė",
1682
- "b",
1683
- "y",
1684
- "ų",
1685
- "š",
1686
- "ž",
1687
- "c",
1688
- "ą",
1689
- "į",
1690
- ],
1691
- "Slovene": [
1692
- "e",
1693
- "a",
1694
- "i",
1695
- "o",
1696
- "n",
1697
- "r",
1698
- "s",
1699
- "l",
1700
- "t",
1701
- "j",
1702
- "v",
1703
- "k",
1704
- "d",
1705
- "p",
1706
- "m",
1707
- "u",
1708
- "z",
1709
- "b",
1710
- "g",
1711
- "h",
1712
- "č",
1713
- "c",
1714
- "š",
1715
- "ž",
1716
- "f",
1717
- "y",
1718
- ],
1719
- "Slovak": [
1720
- "o",
1721
- "a",
1722
- "e",
1723
- "n",
1724
- "i",
1725
- "r",
1726
- "v",
1727
- "t",
1728
- "s",
1729
- "l",
1730
- "k",
1731
- "d",
1732
- "m",
1733
- "p",
1734
- "u",
1735
- "c",
1736
- "h",
1737
- "j",
1738
- "b",
1739
- "z",
1740
- "á",
1741
- "y",
1742
- "ý",
1743
- "í",
1744
- "č",
1745
- "é",
1746
- ],
1747
- "Hebrew": [
1748
- "י",
1749
- "ו",
1750
- "ה",
1751
- "ל",
1752
- "ר",
1753
- "ב",
1754
- "ת",
1755
- "מ",
1756
- "א",
1757
- "ש",
1758
- "נ",
1759
- "ע",
1760
- "ם",
1761
- "ד",
1762
- "ק",
1763
- "ח",
1764
- "פ",
1765
- "ס",
1766
- "כ",
1767
- "ג",
1768
- "ט",
1769
- "צ",
1770
- "ן",
1771
- "ז",
1772
- "ך",
1773
- ],
1774
- "Bulgarian": [
1775
- "а",
1776
- "и",
1777
- "о",
1778
- "е",
1779
- "н",
1780
- "т",
1781
- "р",
1782
- "с",
1783
- "в",
1784
- "л",
1785
- "к",
1786
- "д",
1787
- "п",
1788
- "м",
1789
- "з",
1790
- "г",
1791
- "я",
1792
- "ъ",
1793
- "у",
1794
- "б",
1795
- "ч",
1796
- "ц",
1797
- "й",
1798
- "ж",
1799
- "щ",
1800
- "х",
1801
- ],
1802
- "Croatian": [
1803
- "a",
1804
- "i",
1805
- "o",
1806
- "e",
1807
- "n",
1808
- "r",
1809
- "j",
1810
- "s",
1811
- "t",
1812
- "u",
1813
- "k",
1814
- "l",
1815
- "v",
1816
- "d",
1817
- "m",
1818
- "p",
1819
- "g",
1820
- "z",
1821
- "b",
1822
- "c",
1823
- "č",
1824
- "h",
1825
- "š",
1826
- "ž",
1827
- "ć",
1828
- "f",
1829
- ],
1830
- "Hindi": [
1831
- "क",
1832
- "र",
1833
- "स",
1834
- "न",
1835
- "त",
1836
- "म",
1837
- "ह",
1838
- "प",
1839
- "य",
1840
- "ल",
1841
- "व",
1842
- "ज",
1843
- "द",
1844
- "ग",
1845
- "ब",
1846
- "श",
1847
- "ट",
1848
- "अ",
1849
- "ए",
1850
- "थ",
1851
- "भ",
1852
- "ड",
1853
- "च",
1854
- "ध",
1855
- "ष",
1856
- "इ",
1857
- ],
1858
- "Estonian": [
1859
- "a",
1860
- "i",
1861
- "e",
1862
- "s",
1863
- "t",
1864
- "l",
1865
- "u",
1866
- "n",
1867
- "o",
1868
- "k",
1869
- "r",
1870
- "d",
1871
- "m",
1872
- "v",
1873
- "g",
1874
- "p",
1875
- "j",
1876
- "h",
1877
- "ä",
1878
- "b",
1879
- "õ",
1880
- "ü",
1881
- "f",
1882
- "c",
1883
- "ö",
1884
- "y",
1885
- ],
1886
- "Thai": [
1887
- "า",
1888
- "น",
1889
- "ร",
1890
- "อ",
1891
- "ก",
1892
- "เ",
1893
- "ง",
1894
- "ม",
1895
- "ย",
1896
- "ล",
1897
- "ว",
1898
- "ด",
1899
- "ท",
1900
- "ส",
1901
- "ต",
1902
- "ะ",
1903
- "ป",
1904
- "บ",
1905
- "ค",
1906
- "ห",
1907
- "แ",
1908
- "จ",
1909
- "พ",
1910
- "ช",
1911
- "ข",
1912
- "ใ",
1913
- ],
1914
- "Greek": [
1915
- "α",
1916
- "τ",
1917
- "ο",
1918
- "ι",
1919
- "ε",
1920
- "ν",
1921
- "ρ",
1922
- "σ",
1923
- "κ",
1924
- "η",
1925
- "π",
1926
- "ς",
1927
- "υ",
1928
- "μ",
1929
- "λ",
1930
- "ί",
1931
- "ό",
1932
- "ά",
1933
- "γ",
1934
- "έ",
1935
- "δ",
1936
- "ή",
1937
- "ω",
1938
- "χ",
1939
- "θ",
1940
- "ύ",
1941
- ],
1942
- "Tamil": [
1943
- "க",
1944
- "த",
1945
- "ப",
1946
- "ட",
1947
- "ர",
1948
- "ம",
1949
- "ல",
1950
- "ன",
1951
- "வ",
1952
- "ற",
1953
- "ய",
1954
- "ள",
1955
- "ச",
1956
- "ந",
1957
- "இ",
1958
- "ண",
1959
- "அ",
1960
- "ஆ",
1961
- "ழ",
1962
- "ங",
1963
- "எ",
1964
- "உ",
1965
- "ஒ",
1966
- "ஸ",
1967
- ],
1968
- "Kazakh": [
1969
- "а",
1970
- "ы",
1971
- "е",
1972
- "н",
1973
- "т",
1974
- "р",
1975
- "л",
1976
- "і",
1977
- "д",
1978
- "с",
1979
- "м",
1980
- "қ",
1981
- "к",
1982
- "о",
1983
- "б",
1984
- "и",
1985
- "у",
1986
- "ғ",
1987
- "ж",
1988
- "ң",
1989
- "з",
1990
- "ш",
1991
- "й",
1992
- "п",
1993
- "г",
1994
- "ө",
1995
- ],
1996
- }
1997
-
1998
- LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
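
constant.py is pure data; the tables above are consumed elsewhere in the package (utils.py exposes a `unicode_range` helper over `UNICODE_RANGES_COMBINED`, as the md.py diff below shows via its imports). As an illustration only, a linear lookup over that dict, assuming the pip-installed package; `lookup_unicode_range` is a hypothetical name for this sketch, not the library's helper.

    from __future__ import annotations

    from charset_normalizer.constant import UNICODE_RANGES_COMBINED


    def lookup_unicode_range(character: str) -> str | None:
        """Return the name of the Unicode block containing `character`, if any."""
        code_point = ord(character)
        for range_name, code_points in UNICODE_RANGES_COMBINED.items():
            # `in range(...)` is an O(1) membership test for ints in CPython.
            if code_point in code_points:
                return range_name
        return None


    assert lookup_unicode_range("é") == "Latin-1 Supplement"
    assert lookup_unicode_range("あ") == "Hiragana"
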
env/Lib/site-packages/charset_normalizer/legacy.py DELETED
@@ -1,66 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING, Any
4
- from warnings import warn
5
-
6
- from .api import from_bytes
7
- from .constant import CHARDET_CORRESPONDENCE
8
-
9
- # TODO: remove this check when dropping Python 3.7 support
10
- if TYPE_CHECKING:
11
- from typing_extensions import TypedDict
12
-
13
- class ResultDict(TypedDict):
14
- encoding: str | None
15
- language: str
16
- confidence: float | None
17
-
18
-
19
- def detect(
20
- byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
21
- ) -> ResultDict:
22
- """
23
- chardet legacy method
24
- Detect the encoding of the given byte string. It should be mostly backward-compatible.
25
- Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
26
- This function is deprecated and should be used to migrate your project easily, consult the documentation for
27
- further information. Not planned for removal.
28
-
29
- :param byte_str: The byte sequence to examine.
30
- :param should_rename_legacy: Should we rename legacy encodings
31
- to their more modern equivalents?
32
- """
33
- if len(kwargs):
34
- warn(
35
- f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
36
- )
37
-
38
- if not isinstance(byte_str, (bytearray, bytes)):
39
- raise TypeError( # pragma: nocover
40
- "Expected object of type bytes or bytearray, got: " "{}".format(
41
- type(byte_str)
42
- )
43
- )
44
-
45
- if isinstance(byte_str, bytearray):
46
- byte_str = bytes(byte_str)
47
-
48
- r = from_bytes(byte_str).best()
49
-
50
- encoding = r.encoding if r is not None else None
51
- language = r.language if r is not None and r.language != "Unknown" else ""
52
- confidence = 1.0 - r.chaos if r is not None else None
53
-
54
- # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
55
- # but chardet does return 'utf-8-sig' and it is a valid codec name.
56
- if r is not None and encoding == "utf_8" and r.bom:
57
- encoding += "_sig"
58
-
59
- if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
60
- encoding = CHARDET_CORRESPONDENCE[encoding]
61
-
62
- return {
63
- "encoding": encoding,
64
- "language": language,
65
- "confidence": confidence,
66
- }
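
The shim deleted above is what makes charset-normalizer usable as a chardet drop-in. A minimal usage sketch, assuming the pip-installed package (the sample bytes are illustrative only):

    from charset_normalizer import detect  # re-exported from legacy.py

    payload = "Comment ça va ?".encode("cp1252")  # illustrative sample input
    result = detect(payload)

    # Same shape chardet returns: {'encoding': ..., 'language': ..., 'confidence': ...}
    print(result["encoding"], result["language"], result["confidence"])
    # The exact guess depends on the input; cp1252 may be reported as a
    # similar single-byte codec (see IANA_SUPPORTED_SIMILAR in constant.py above).
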
env/Lib/site-packages/charset_normalizer/md.py DELETED
@@ -1,630 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from functools import lru_cache
4
- from logging import getLogger
5
-
6
- from .constant import (
7
- COMMON_SAFE_ASCII_CHARACTERS,
8
- TRACE,
9
- UNICODE_SECONDARY_RANGE_KEYWORD,
10
- )
11
- from .utils import (
12
- is_accentuated,
13
- is_arabic,
14
- is_arabic_isolated_form,
15
- is_case_variable,
16
- is_cjk,
17
- is_emoticon,
18
- is_hangul,
19
- is_hiragana,
20
- is_katakana,
21
- is_latin,
22
- is_punctuation,
23
- is_separator,
24
- is_symbol,
25
- is_thai,
26
- is_unprintable,
27
- remove_accent,
28
- unicode_range,
29
- )
30
-
31
-
32
- class MessDetectorPlugin:
33
- """
34
- Base abstract class used for mess detection plugins.
35
- All detectors MUST extend and implement given methods.
36
- """
37
-
38
- def eligible(self, character: str) -> bool:
39
- """
40
- Determine if given character should be fed in.
41
- """
42
- raise NotImplementedError # pragma: nocover
43
-
44
- def feed(self, character: str) -> None:
45
- """
46
- The main routine to be executed upon character.
47
- Insert the logic in witch the text would be considered chaotic.
48
- """
49
- raise NotImplementedError # pragma: nocover
50
-
51
- def reset(self) -> None: # pragma: no cover
52
- """
53
- Permit to reset the plugin to the initial state.
54
- """
55
- raise NotImplementedError
56
-
57
- @property
58
- def ratio(self) -> float:
59
- """
60
- Compute the chaos ratio based on what your feed() has seen.
61
- Must NOT be lower than 0.; No restriction gt 0.
62
- """
63
- raise NotImplementedError # pragma: nocover
64
-
65
-
66
- class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
67
- def __init__(self) -> None:
68
- self._punctuation_count: int = 0
69
- self._symbol_count: int = 0
70
- self._character_count: int = 0
71
-
72
- self._last_printable_char: str | None = None
73
- self._frenzy_symbol_in_word: bool = False
74
-
75
- def eligible(self, character: str) -> bool:
76
- return character.isprintable()
77
-
78
- def feed(self, character: str) -> None:
79
- self._character_count += 1
80
-
81
- if (
82
- character != self._last_printable_char
83
- and character not in COMMON_SAFE_ASCII_CHARACTERS
84
- ):
85
- if is_punctuation(character):
86
- self._punctuation_count += 1
87
- elif (
88
- character.isdigit() is False
89
- and is_symbol(character)
90
- and is_emoticon(character) is False
91
- ):
92
- self._symbol_count += 2
93
-
94
- self._last_printable_char = character
95
-
96
- def reset(self) -> None: # Abstract
97
- self._punctuation_count = 0
98
- self._character_count = 0
99
- self._symbol_count = 0
100
-
101
- @property
102
- def ratio(self) -> float:
103
- if self._character_count == 0:
104
- return 0.0
105
-
106
- ratio_of_punctuation: float = (
107
- self._punctuation_count + self._symbol_count
108
- ) / self._character_count
109
-
110
- return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
111
-
112
-
113
- class TooManyAccentuatedPlugin(MessDetectorPlugin):
114
- def __init__(self) -> None:
115
- self._character_count: int = 0
116
- self._accentuated_count: int = 0
117
-
118
- def eligible(self, character: str) -> bool:
119
- return character.isalpha()
120
-
121
- def feed(self, character: str) -> None:
122
- self._character_count += 1
123
-
124
- if is_accentuated(character):
125
- self._accentuated_count += 1
126
-
127
- def reset(self) -> None: # Abstract
128
- self._character_count = 0
129
- self._accentuated_count = 0
130
-
131
- @property
132
- def ratio(self) -> float:
133
- if self._character_count < 8:
134
- return 0.0
135
-
136
- ratio_of_accentuation: float = self._accentuated_count / self._character_count
137
- return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
138
-
139
-
140
- class UnprintablePlugin(MessDetectorPlugin):
141
- def __init__(self) -> None:
142
- self._unprintable_count: int = 0
143
- self._character_count: int = 0
144
-
145
- def eligible(self, character: str) -> bool:
146
- return True
147
-
148
- def feed(self, character: str) -> None:
149
- if is_unprintable(character):
150
- self._unprintable_count += 1
151
- self._character_count += 1
152
-
153
- def reset(self) -> None: # Abstract
154
- self._unprintable_count = 0
155
-
156
- @property
157
- def ratio(self) -> float:
158
- if self._character_count == 0:
159
- return 0.0
160
-
161
- return (self._unprintable_count * 8) / self._character_count
162
-
163
-
164
- class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
165
- def __init__(self) -> None:
166
- self._successive_count: int = 0
167
- self._character_count: int = 0
168
-
169
- self._last_latin_character: str | None = None
170
-
171
- def eligible(self, character: str) -> bool:
172
- return character.isalpha() and is_latin(character)
173
-
174
- def feed(self, character: str) -> None:
175
- self._character_count += 1
176
- if (
177
- self._last_latin_character is not None
178
- and is_accentuated(character)
179
- and is_accentuated(self._last_latin_character)
180
- ):
181
- if character.isupper() and self._last_latin_character.isupper():
182
- self._successive_count += 1
183
- # Worse if its the same char duplicated with different accent.
184
- if remove_accent(character) == remove_accent(self._last_latin_character):
185
- self._successive_count += 1
186
- self._last_latin_character = character
187
-
188
- def reset(self) -> None: # Abstract
189
- self._successive_count = 0
190
- self._character_count = 0
191
- self._last_latin_character = None
192
-
193
- @property
194
- def ratio(self) -> float:
195
- if self._character_count == 0:
196
- return 0.0
197
-
198
- return (self._successive_count * 2) / self._character_count
199
-
200
-
201
- class SuspiciousRange(MessDetectorPlugin):
202
- def __init__(self) -> None:
203
- self._suspicious_successive_range_count: int = 0
204
- self._character_count: int = 0
205
- self._last_printable_seen: str | None = None
206
-
207
- def eligible(self, character: str) -> bool:
208
- return character.isprintable()
209
-
210
- def feed(self, character: str) -> None:
211
- self._character_count += 1
212
-
213
- if (
214
- character.isspace()
215
- or is_punctuation(character)
216
- or character in COMMON_SAFE_ASCII_CHARACTERS
217
- ):
218
- self._last_printable_seen = None
219
- return
220
-
221
- if self._last_printable_seen is None:
222
- self._last_printable_seen = character
223
- return
224
-
225
- unicode_range_a: str | None = unicode_range(self._last_printable_seen)
226
- unicode_range_b: str | None = unicode_range(character)
227
-
228
- if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
229
- self._suspicious_successive_range_count += 1
230
-
231
- self._last_printable_seen = character
232
-
233
- def reset(self) -> None: # Abstract
234
- self._character_count = 0
235
- self._suspicious_successive_range_count = 0
236
- self._last_printable_seen = None
237
-
238
- @property
239
- def ratio(self) -> float:
240
- if self._character_count <= 13:
241
- return 0.0
242
-
243
- ratio_of_suspicious_range_usage: float = (
244
- self._suspicious_successive_range_count * 2
245
- ) / self._character_count
246
-
247
- return ratio_of_suspicious_range_usage
248
-
249
-
250
- class SuperWeirdWordPlugin(MessDetectorPlugin):
251
- def __init__(self) -> None:
252
- self._word_count: int = 0
253
- self._bad_word_count: int = 0
254
- self._foreign_long_count: int = 0
255
-
256
- self._is_current_word_bad: bool = False
257
- self._foreign_long_watch: bool = False
258
-
259
- self._character_count: int = 0
260
- self._bad_character_count: int = 0
261
-
262
- self._buffer: str = ""
263
- self._buffer_accent_count: int = 0
264
- self._buffer_glyph_count: int = 0
265
-
266
- def eligible(self, character: str) -> bool:
267
- return True
268
-
269
- def feed(self, character: str) -> None:
270
- if character.isalpha():
271
- self._buffer += character
272
- if is_accentuated(character):
273
- self._buffer_accent_count += 1
274
- if (
275
- self._foreign_long_watch is False
276
- and (is_latin(character) is False or is_accentuated(character))
277
- and is_cjk(character) is False
278
- and is_hangul(character) is False
279
- and is_katakana(character) is False
280
- and is_hiragana(character) is False
281
- and is_thai(character) is False
282
- ):
283
- self._foreign_long_watch = True
284
- if (
285
- is_cjk(character)
286
- or is_hangul(character)
287
- or is_katakana(character)
288
- or is_hiragana(character)
289
- or is_thai(character)
290
- ):
291
- self._buffer_glyph_count += 1
292
- return
293
- if not self._buffer:
294
- return
295
- if (
296
- character.isspace() or is_punctuation(character) or is_separator(character)
297
- ) and self._buffer:
298
- self._word_count += 1
299
-            buffer_length: int = len(self._buffer)
-
-            self._character_count += buffer_length
-
-            if buffer_length >= 4:
-                if self._buffer_accent_count / buffer_length >= 0.5:
-                    self._is_current_word_bad = True
-                # Word/Buffer ending with an upper case accentuated letter are so rare,
-                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                elif (
-                    is_accentuated(self._buffer[-1])
-                    and self._buffer[-1].isupper()
-                    and all(_.isupper() for _ in self._buffer) is False
-                ):
-                    self._foreign_long_count += 1
-                    self._is_current_word_bad = True
-                elif self._buffer_glyph_count == 1:
-                    self._is_current_word_bad = True
-                    self._foreign_long_count += 1
-            if buffer_length >= 24 and self._foreign_long_watch:
-                camel_case_dst = [
-                    i
-                    for c, i in zip(self._buffer, range(0, buffer_length))
-                    if c.isupper()
-                ]
-                probable_camel_cased: bool = False
-
-                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
-                    probable_camel_cased = True
-
-                if not probable_camel_cased:
-                    self._foreign_long_count += 1
-                    self._is_current_word_bad = True
-
-            if self._is_current_word_bad:
-                self._bad_word_count += 1
-                self._bad_character_count += len(self._buffer)
-                self._is_current_word_bad = False
-
-            self._foreign_long_watch = False
-            self._buffer = ""
-            self._buffer_accent_count = 0
-            self._buffer_glyph_count = 0
-        elif (
-            character not in {"<", ">", "-", "=", "~", "|", "_"}
-            and character.isdigit() is False
-            and is_symbol(character)
-        ):
-            self._is_current_word_bad = True
-            self._buffer += character
-
-    def reset(self) -> None:  # Abstract
-        self._buffer = ""
-        self._is_current_word_bad = False
-        self._foreign_long_watch = False
-        self._bad_word_count = 0
-        self._word_count = 0
-        self._character_count = 0
-        self._bad_character_count = 0
-        self._foreign_long_count = 0
-
-    @property
-    def ratio(self) -> float:
-        if self._word_count <= 10 and self._foreign_long_count == 0:
-            return 0.0
-
-        return self._bad_character_count / self._character_count
-
-
-class CjkInvalidStopPlugin(MessDetectorPlugin):
-    """
-    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
-    can be easily detected. Searching for the overuse of '丅' and '丄'.
-    """
-
-    def __init__(self) -> None:
-        self._wrong_stop_count: int = 0
-        self._cjk_character_count: int = 0
-
-    def eligible(self, character: str) -> bool:
-        return True
-
-    def feed(self, character: str) -> None:
-        if character in {"丅", "丄"}:
-            self._wrong_stop_count += 1
-            return
-        if is_cjk(character):
-            self._cjk_character_count += 1
-
-    def reset(self) -> None:  # Abstract
-        self._wrong_stop_count = 0
-        self._cjk_character_count = 0
-
-    @property
-    def ratio(self) -> float:
-        if self._cjk_character_count < 16:
-            return 0.0
-        return self._wrong_stop_count / self._cjk_character_count
-
-
-class ArchaicUpperLowerPlugin(MessDetectorPlugin):
-    def __init__(self) -> None:
-        self._buf: bool = False
-
-        self._character_count_since_last_sep: int = 0
-
-        self._successive_upper_lower_count: int = 0
-        self._successive_upper_lower_count_final: int = 0
-
-        self._character_count: int = 0
-
-        self._last_alpha_seen: str | None = None
-        self._current_ascii_only: bool = True
-
-    def eligible(self, character: str) -> bool:
-        return True
-
-    def feed(self, character: str) -> None:
-        is_concerned = character.isalpha() and is_case_variable(character)
-        chunk_sep = is_concerned is False
-
-        if chunk_sep and self._character_count_since_last_sep > 0:
-            if (
-                self._character_count_since_last_sep <= 64
-                and character.isdigit() is False
-                and self._current_ascii_only is False
-            ):
-                self._successive_upper_lower_count_final += (
-                    self._successive_upper_lower_count
-                )
-
-            self._successive_upper_lower_count = 0
-            self._character_count_since_last_sep = 0
-            self._last_alpha_seen = None
-            self._buf = False
-            self._character_count += 1
-            self._current_ascii_only = True
-
-            return
-
-        if self._current_ascii_only is True and character.isascii() is False:
-            self._current_ascii_only = False
-
-        if self._last_alpha_seen is not None:
-            if (character.isupper() and self._last_alpha_seen.islower()) or (
-                character.islower() and self._last_alpha_seen.isupper()
-            ):
-                if self._buf is True:
-                    self._successive_upper_lower_count += 2
-                    self._buf = False
-                else:
-                    self._buf = True
-            else:
-                self._buf = False
-
-        self._character_count += 1
-        self._character_count_since_last_sep += 1
-        self._last_alpha_seen = character
-
-    def reset(self) -> None:  # Abstract
-        self._character_count = 0
-        self._character_count_since_last_sep = 0
-        self._successive_upper_lower_count = 0
-        self._successive_upper_lower_count_final = 0
-        self._last_alpha_seen = None
-        self._buf = False
-        self._current_ascii_only = True
-
-    @property
-    def ratio(self) -> float:
-        if self._character_count == 0:
-            return 0.0
-
-        return self._successive_upper_lower_count_final / self._character_count
-
-
-class ArabicIsolatedFormPlugin(MessDetectorPlugin):
-    def __init__(self) -> None:
-        self._character_count: int = 0
-        self._isolated_form_count: int = 0
-
-    def reset(self) -> None:  # Abstract
-        self._character_count = 0
-        self._isolated_form_count = 0
-
-    def eligible(self, character: str) -> bool:
-        return is_arabic(character)
-
-    def feed(self, character: str) -> None:
-        self._character_count += 1
-
-        if is_arabic_isolated_form(character):
-            self._isolated_form_count += 1
-
-    @property
-    def ratio(self) -> float:
-        if self._character_count < 8:
-            return 0.0
-
-        isolated_form_usage: float = self._isolated_form_count / self._character_count
-
-        return isolated_form_usage
-
-
-@lru_cache(maxsize=1024)
-def is_suspiciously_successive_range(
-    unicode_range_a: str | None, unicode_range_b: str | None
-) -> bool:
-    """
-    Determine if two Unicode range seen next to each other can be considered as suspicious.
-    """
-    if unicode_range_a is None or unicode_range_b is None:
-        return True
-
-    if unicode_range_a == unicode_range_b:
-        return False
-
-    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
-        return False
-
-    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
-        return False
-
-    # Latin characters can be accompanied with a combining diacritical mark
-    # eg. Vietnamese.
-    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
-        "Combining" in unicode_range_a or "Combining" in unicode_range_b
-    ):
-        return False
-
-    keywords_range_a, keywords_range_b = (
-        unicode_range_a.split(" "),
-        unicode_range_b.split(" "),
-    )
-
-    for el in keywords_range_a:
-        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
-            continue
-        if el in keywords_range_b:
-            return False
-
-    # Japanese Exception
-    range_a_jp_chars, range_b_jp_chars = (
-        unicode_range_a
-        in (
-            "Hiragana",
-            "Katakana",
-        ),
-        unicode_range_b in ("Hiragana", "Katakana"),
-    )
-    if (range_a_jp_chars or range_b_jp_chars) and (
-        "CJK" in unicode_range_a or "CJK" in unicode_range_b
-    ):
-        return False
-    if range_a_jp_chars and range_b_jp_chars:
-        return False
-
-    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
-        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
-            return False
-        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
-            return False
-
-    # Chinese/Japanese use dedicated range for punctuation and/or separators.
-    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
-        unicode_range_a in ["Katakana", "Hiragana"]
-        and unicode_range_b in ["Katakana", "Hiragana"]
-    ):
-        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
-            return False
-        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
-            return False
-        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
-            return False
-
-    return True
-
-
-@lru_cache(maxsize=2048)
-def mess_ratio(
-    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
-) -> float:
-    """
-    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
-    """
-
-    detectors: list[MessDetectorPlugin] = [
-        md_class() for md_class in MessDetectorPlugin.__subclasses__()
-    ]
-
-    length: int = len(decoded_sequence) + 1
-
-    mean_mess_ratio: float = 0.0
-
-    if length < 512:
-        intermediary_mean_mess_ratio_calc: int = 32
-    elif length <= 1024:
-        intermediary_mean_mess_ratio_calc = 64
-    else:
-        intermediary_mean_mess_ratio_calc = 128
-
-    for character, index in zip(decoded_sequence + "\n", range(length)):
-        for detector in detectors:
-            if detector.eligible(character):
-                detector.feed(character)
-
-        if (
-            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
-        ) or index == length - 1:
-            mean_mess_ratio = sum(dt.ratio for dt in detectors)
-
-            if mean_mess_ratio >= maximum_threshold:
-                break
-
-    if debug:
-        logger = getLogger("charset_normalizer")
-
-        logger.log(
-            TRACE,
-            "Mess-detector extended-analysis start. "
-            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
-            f"maximum_threshold={maximum_threshold}",
-        )
-
-        if len(decoded_sequence) > 16:
-            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
-            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
-
-        for dt in detectors:
-            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
-
    return round(mean_mess_ratio, 3)
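
For orientation: mess_ratio is this module's entry point, summing every plugin's ratio as characters stream through, and stopping early once maximum_threshold is reached. A minimal sketch of calling it directly (it is an internal helper of charset_normalizer 3.4.x, so the import path is version-specific; printed values are illustrative, not guaranteed):

    from charset_normalizer.md import mess_ratio

    clean = "A short, perfectly ordinary sentence."
    mangled = "Ã©tÃ© â€“ dÃ©jÃ "  # UTF-8 bytes mistakenly decoded as Latin-1

    print(mess_ratio(clean))    # near 0.0 for natural text
    print(mess_ratio(mangled))  # noticeably higher for mojibake
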
env/Lib/site-packages/charset_normalizer/models.py DELETED
@@ -1,360 +0,0 @@
-from __future__ import annotations
-
-from encodings.aliases import aliases
-from hashlib import sha256
-from json import dumps
-from re import sub
-from typing import Any, Iterator, List, Tuple
-
-from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
-from .utils import iana_name, is_multi_byte_encoding, unicode_range
-
-
-class CharsetMatch:
-    def __init__(
-        self,
-        payload: bytes,
-        guessed_encoding: str,
-        mean_mess_ratio: float,
-        has_sig_or_bom: bool,
-        languages: CoherenceMatches,
-        decoded_payload: str | None = None,
-        preemptive_declaration: str | None = None,
-    ):
-        self._payload: bytes = payload
-
-        self._encoding: str = guessed_encoding
-        self._mean_mess_ratio: float = mean_mess_ratio
-        self._languages: CoherenceMatches = languages
-        self._has_sig_or_bom: bool = has_sig_or_bom
-        self._unicode_ranges: list[str] | None = None
-
-        self._leaves: list[CharsetMatch] = []
-        self._mean_coherence_ratio: float = 0.0
-
-        self._output_payload: bytes | None = None
-        self._output_encoding: str | None = None
-
-        self._string: str | None = decoded_payload
-
-        self._preemptive_declaration: str | None = preemptive_declaration
-
-    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, CharsetMatch):
-            if isinstance(other, str):
-                return iana_name(other) == self.encoding
-            return False
-        return self.encoding == other.encoding and self.fingerprint == other.fingerprint
-
-    def __lt__(self, other: object) -> bool:
-        """
-        Implemented to make sorted available upon CharsetMatches items.
-        """
-        if not isinstance(other, CharsetMatch):
-            raise ValueError
-
-        chaos_difference: float = abs(self.chaos - other.chaos)
-        coherence_difference: float = abs(self.coherence - other.coherence)
-
-        # Below 1% difference --> Use Coherence
-        if chaos_difference < 0.01 and coherence_difference > 0.02:
-            return self.coherence > other.coherence
-        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
-            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
-            # preserve RAM usage!
-            if len(self._payload) >= TOO_BIG_SEQUENCE:
-                return self.chaos < other.chaos
-            return self.multi_byte_usage > other.multi_byte_usage
-
-        return self.chaos < other.chaos
-
-    @property
-    def multi_byte_usage(self) -> float:
-        return 1.0 - (len(str(self)) / len(self.raw))
-
-    def __str__(self) -> str:
-        # Lazy Str Loading
-        if self._string is None:
-            self._string = str(self._payload, self._encoding, "strict")
-        return self._string
-
-    def __repr__(self) -> str:
-        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
-
-    def add_submatch(self, other: CharsetMatch) -> None:
-        if not isinstance(other, CharsetMatch) or other == self:
-            raise ValueError(
-                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
-                    other.__class__
-                )
-            )
-
-        other._string = None  # Unload RAM usage; dirty trick.
-        self._leaves.append(other)
-
-    @property
-    def encoding(self) -> str:
-        return self._encoding
-
-    @property
-    def encoding_aliases(self) -> list[str]:
-        """
-        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
-        """
-        also_known_as: list[str] = []
-        for u, p in aliases.items():
-            if self.encoding == u:
-                also_known_as.append(p)
-            elif self.encoding == p:
-                also_known_as.append(u)
-        return also_known_as
-
-    @property
-    def bom(self) -> bool:
-        return self._has_sig_or_bom
-
-    @property
-    def byte_order_mark(self) -> bool:
-        return self._has_sig_or_bom
-
-    @property
-    def languages(self) -> list[str]:
-        """
-        Return the complete list of possible languages found in decoded sequence.
-        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
-        """
-        return [e[0] for e in self._languages]
-
-    @property
-    def language(self) -> str:
-        """
-        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
-        "Unknown".
-        """
-        if not self._languages:
-            # Trying to infer the language based on the given encoding
-            # Its either English or we should not pronounce ourselves in certain cases.
-            if "ascii" in self.could_be_from_charset:
-                return "English"
-
-            # doing it there to avoid circular import
-            from charset_normalizer.cd import encoding_languages, mb_encoding_languages
-
-            languages = (
-                mb_encoding_languages(self.encoding)
-                if is_multi_byte_encoding(self.encoding)
-                else encoding_languages(self.encoding)
-            )
-
-            if len(languages) == 0 or "Latin Based" in languages:
-                return "Unknown"
-
-            return languages[0]
-
-        return self._languages[0][0]
-
-    @property
-    def chaos(self) -> float:
-        return self._mean_mess_ratio
-
-    @property
-    def coherence(self) -> float:
-        if not self._languages:
-            return 0.0
-        return self._languages[0][1]
-
-    @property
-    def percent_chaos(self) -> float:
-        return round(self.chaos * 100, ndigits=3)
-
-    @property
-    def percent_coherence(self) -> float:
-        return round(self.coherence * 100, ndigits=3)
-
-    @property
-    def raw(self) -> bytes:
-        """
-        Original untouched bytes.
-        """
-        return self._payload
-
-    @property
-    def submatch(self) -> list[CharsetMatch]:
-        return self._leaves
-
-    @property
-    def has_submatch(self) -> bool:
-        return len(self._leaves) > 0
-
-    @property
-    def alphabets(self) -> list[str]:
-        if self._unicode_ranges is not None:
-            return self._unicode_ranges
-        # list detected ranges
-        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
-        # filter and sort
-        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
-        return self._unicode_ranges
-
-    @property
-    def could_be_from_charset(self) -> list[str]:
-        """
-        The complete list of encoding that output the exact SAME str result and therefore could be the originating
-        encoding.
-        This list does include the encoding available in property 'encoding'.
-        """
-        return [self._encoding] + [m.encoding for m in self._leaves]
-
-    def output(self, encoding: str = "utf_8") -> bytes:
-        """
-        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
-        Any errors will be simply ignored by the encoder NOT replaced.
-        """
-        if self._output_encoding is None or self._output_encoding != encoding:
-            self._output_encoding = encoding
-            decoded_string = str(self)
-            if (
-                self._preemptive_declaration is not None
-                and self._preemptive_declaration.lower()
-                not in ["utf-8", "utf8", "utf_8"]
-            ):
-                patched_header = sub(
-                    RE_POSSIBLE_ENCODING_INDICATION,
-                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
-                        m.groups()[0],
-                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
-                    ),
-                    decoded_string[:8192],
-                    count=1,
-                )
-
-                decoded_string = patched_header + decoded_string[8192:]
-
-            self._output_payload = decoded_string.encode(encoding, "replace")
-
-        return self._output_payload  # type: ignore
-
-    @property
-    def fingerprint(self) -> str:
-        """
-        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
-        """
-        return sha256(self.output()).hexdigest()
-
-
-class CharsetMatches:
-    """
-    Container with every CharsetMatch items ordered by default from most probable to the less one.
-    Act like a list(iterable) but does not implements all related methods.
-    """
-
-    def __init__(self, results: list[CharsetMatch] | None = None):
-        self._results: list[CharsetMatch] = sorted(results) if results else []
-
-    def __iter__(self) -> Iterator[CharsetMatch]:
-        yield from self._results
-
-    def __getitem__(self, item: int | str) -> CharsetMatch:
-        """
-        Retrieve a single item either by its position or encoding name (alias may be used here).
-        Raise KeyError upon invalid index or encoding not present in results.
-        """
-        if isinstance(item, int):
-            return self._results[item]
-        if isinstance(item, str):
-            item = iana_name(item, False)
-            for result in self._results:
-                if item in result.could_be_from_charset:
-                    return result
-        raise KeyError
-
-    def __len__(self) -> int:
-        return len(self._results)
-
-    def __bool__(self) -> bool:
-        return len(self._results) > 0
-
-    def append(self, item: CharsetMatch) -> None:
-        """
-        Insert a single match. Will be inserted accordingly to preserve sort.
-        Can be inserted as a submatch.
-        """
-        if not isinstance(item, CharsetMatch):
-            raise ValueError(
-                "Cannot append instance '{}' to CharsetMatches".format(
-                    str(item.__class__)
-                )
-            )
-        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
-        if len(item.raw) < TOO_BIG_SEQUENCE:
-            for match in self._results:
-                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
-                    match.add_submatch(item)
-                    return
-        self._results.append(item)
-        self._results = sorted(self._results)
-
-    def best(self) -> CharsetMatch | None:
-        """
-        Simply return the first match. Strict equivalent to matches[0].
-        """
-        if not self._results:
-            return None
-        return self._results[0]
-
-    def first(self) -> CharsetMatch | None:
-        """
-        Redundant method, call the method best(). Kept for BC reasons.
-        """
-        return self.best()
-
-
-CoherenceMatch = Tuple[str, float]
-CoherenceMatches = List[CoherenceMatch]
-
-
-class CliDetectionResult:
-    def __init__(
-        self,
-        path: str,
-        encoding: str | None,
-        encoding_aliases: list[str],
-        alternative_encodings: list[str],
-        language: str,
-        alphabets: list[str],
-        has_sig_or_bom: bool,
-        chaos: float,
-        coherence: float,
-        unicode_path: str | None,
-        is_preferred: bool,
-    ):
-        self.path: str = path
-        self.unicode_path: str | None = unicode_path
-        self.encoding: str | None = encoding
-        self.encoding_aliases: list[str] = encoding_aliases
-        self.alternative_encodings: list[str] = alternative_encodings
-        self.language: str = language
-        self.alphabets: list[str] = alphabets
-        self.has_sig_or_bom: bool = has_sig_or_bom
-        self.chaos: float = chaos
-        self.coherence: float = coherence
-        self.is_preferred: bool = is_preferred
-
-    @property
-    def __dict__(self) -> dict[str, Any]:  # type: ignore
-        return {
-            "path": self.path,
-            "encoding": self.encoding,
-            "encoding_aliases": self.encoding_aliases,
-            "alternative_encodings": self.alternative_encodings,
-            "language": self.language,
-            "alphabets": self.alphabets,
-            "has_sig_or_bom": self.has_sig_or_bom,
-            "chaos": self.chaos,
-            "coherence": self.coherence,
-            "unicode_path": self.unicode_path,
-            "is_preferred": self.is_preferred,
-        }
-
-    def to_json(self) -> str:
-        return dumps(self.__dict__, ensure_ascii=True, indent=4)
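
These models are what the public API hands back: from_bytes() returns a CharsetMatches container, and best() yields the winning CharsetMatch. A minimal sketch of how they are consumed (charset_normalizer 3.4.x; printed values depend on the payload):

    from charset_normalizer import from_bytes

    results = from_bytes("Bonjour, où êtes-vous ?".encode("cp1252"))  # CharsetMatches
    best = results.best()  # CharsetMatch or None

    if best is not None:
        print(best.encoding, best.percent_chaos, best.could_be_from_charset)
        print(str(best))             # lazily decoded text
        print(best.output("utf_8"))  # payload re-encoded to UTF-8
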
env/Lib/site-packages/charset_normalizer/py.typed DELETED
File without changes
env/Lib/site-packages/charset_normalizer/utils.py DELETED
@@ -1,408 +0,0 @@
-from __future__ import annotations
-
-import importlib
-import logging
-import unicodedata
-from codecs import IncrementalDecoder
-from encodings.aliases import aliases
-from functools import lru_cache
-from re import findall
-from typing import Generator
-
-from _multibytecodec import (  # type: ignore[import-not-found,import]
-    MultibyteIncrementalDecoder,
-)
-
-from .constant import (
-    ENCODING_MARKS,
-    IANA_SUPPORTED_SIMILAR,
-    RE_POSSIBLE_ENCODING_INDICATION,
-    UNICODE_RANGES_COMBINED,
-    UNICODE_SECONDARY_RANGE_KEYWORD,
-    UTF8_MAXIMAL_ALLOCATION,
-)
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_accentuated(character: str) -> bool:
-    try:
-        description: str = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-    return (
-        "WITH GRAVE" in description
-        or "WITH ACUTE" in description
-        or "WITH CEDILLA" in description
-        or "WITH DIAERESIS" in description
-        or "WITH CIRCUMFLEX" in description
-        or "WITH TILDE" in description
-        or "WITH MACRON" in description
-        or "WITH RING ABOVE" in description
-    )
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def remove_accent(character: str) -> str:
-    decomposed: str = unicodedata.decomposition(character)
-    if not decomposed:
-        return character
-
-    codes: list[str] = decomposed.split(" ")
-
-    return chr(int(codes[0], 16))
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def unicode_range(character: str) -> str | None:
-    """
-    Retrieve the Unicode range official name from a single character.
-    """
-    character_ord: int = ord(character)
-
-    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
-        if character_ord in ord_range:
-            return range_name
-
-    return None
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_latin(character: str) -> bool:
-    try:
-        description: str = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-    return "LATIN" in description
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_punctuation(character: str) -> bool:
-    character_category: str = unicodedata.category(character)
-
-    if "P" in character_category:
-        return True
-
-    character_range: str | None = unicode_range(character)
-
-    if character_range is None:
-        return False
-
-    return "Punctuation" in character_range
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_symbol(character: str) -> bool:
-    character_category: str = unicodedata.category(character)
-
-    if "S" in character_category or "N" in character_category:
-        return True
-
-    character_range: str | None = unicode_range(character)
-
-    if character_range is None:
-        return False
-
-    return "Forms" in character_range and character_category != "Lo"
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_emoticon(character: str) -> bool:
-    character_range: str | None = unicode_range(character)
-
-    if character_range is None:
-        return False
-
-    return "Emoticons" in character_range or "Pictographs" in character_range
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_separator(character: str) -> bool:
-    if character.isspace() or character in {"|", "+", "<", ">"}:
-        return True
-
-    character_category: str = unicodedata.category(character)
-
-    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_case_variable(character: str) -> bool:
-    return character.islower() != character.isupper()
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_cjk(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "CJK" in character_name
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_hiragana(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "HIRAGANA" in character_name
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_katakana(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "KATAKANA" in character_name
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_hangul(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "HANGUL" in character_name
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_thai(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "THAI" in character_name
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_arabic(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "ARABIC" in character_name
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_arabic_isolated_form(character: str) -> bool:
-    try:
-        character_name = unicodedata.name(character)
-    except ValueError:  # Defensive: unicode database outdated?
-        return False
-
-    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
-
-
-@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
-def is_unicode_range_secondary(range_name: str) -> bool:
-    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
-
-
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_unprintable(character: str) -> bool:
-    return (
-        character.isspace() is False  # includes \n \t \r \v
-        and character.isprintable() is False
-        and character != "\x1a"  # Why? Its the ASCII substitute character.
-        and character != "\ufeff"  # bug discovered in Python,
-        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
-    )
-
-
-def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
-    """
-    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
-    """
-    if not isinstance(sequence, bytes):
-        raise TypeError
-
-    seq_len: int = len(sequence)
-
-    results: list[str] = findall(
-        RE_POSSIBLE_ENCODING_INDICATION,
-        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
-    )
-
-    if len(results) == 0:
-        return None
-
-    for specified_encoding in results:
-        specified_encoding = specified_encoding.lower().replace("-", "_")
-
-        encoding_alias: str
-        encoding_iana: str
-
-        for encoding_alias, encoding_iana in aliases.items():
-            if encoding_alias == specified_encoding:
-                return encoding_iana
-            if encoding_iana == specified_encoding:
-                return encoding_iana
-
-    return None
-
-
-@lru_cache(maxsize=128)
-def is_multi_byte_encoding(name: str) -> bool:
-    """
-    Verify is a specific encoding is a multi byte one based on it IANA name
-    """
-    return name in {
-        "utf_8",
-        "utf_8_sig",
-        "utf_16",
-        "utf_16_be",
-        "utf_16_le",
-        "utf_32",
-        "utf_32_le",
-        "utf_32_be",
-        "utf_7",
-    } or issubclass(
-        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
-        MultibyteIncrementalDecoder,
-    )
-
-
-def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
-    """
-    Identify and extract SIG/BOM in given sequence.
-    """
-
-    for iana_encoding in ENCODING_MARKS:
-        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
-
-        if isinstance(marks, bytes):
-            marks = [marks]
-
-        for mark in marks:
-            if sequence.startswith(mark):
-                return iana_encoding, mark
-
-    return None, b""
-
-
-def should_strip_sig_or_bom(iana_encoding: str) -> bool:
-    return iana_encoding not in {"utf_16", "utf_32"}
-
-
-def iana_name(cp_name: str, strict: bool = True) -> str:
-    """Returns the Python normalized encoding name (Not the IANA official name)."""
-    cp_name = cp_name.lower().replace("-", "_")
-
-    encoding_alias: str
-    encoding_iana: str
-
-    for encoding_alias, encoding_iana in aliases.items():
-        if cp_name in [encoding_alias, encoding_iana]:
-            return encoding_iana
-
-    if strict:
-        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
-
-    return cp_name
-
-
-def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
-    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
-        return 0.0
-
-    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
-    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
-
-    id_a: IncrementalDecoder = decoder_a(errors="ignore")
-    id_b: IncrementalDecoder = decoder_b(errors="ignore")
-
-    character_match_count: int = 0
-
-    for i in range(255):
-        to_be_decoded: bytes = bytes([i])
-        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
-            character_match_count += 1
-
-    return character_match_count / 254
-
-
-def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
-    """
-    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
-    the function cp_similarity.
-    """
-    return (
-        iana_name_a in IANA_SUPPORTED_SIMILAR
-        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
-    )
-
-
-def set_logging_handler(
-    name: str = "charset_normalizer",
-    level: int = logging.INFO,
-    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
-) -> None:
-    logger = logging.getLogger(name)
-    logger.setLevel(level)
-
-    handler = logging.StreamHandler()
-    handler.setFormatter(logging.Formatter(format_string))
-    logger.addHandler(handler)
-
-
-def cut_sequence_chunks(
-    sequences: bytes,
-    encoding_iana: str,
-    offsets: range,
-    chunk_size: int,
-    bom_or_sig_available: bool,
-    strip_sig_or_bom: bool,
-    sig_payload: bytes,
-    is_multi_byte_decoder: bool,
-    decoded_payload: str | None = None,
-) -> Generator[str, None, None]:
-    if decoded_payload and is_multi_byte_decoder is False:
-        for i in offsets:
-            chunk = decoded_payload[i : i + chunk_size]
-            if not chunk:
-                break
-            yield chunk
-    else:
-        for i in offsets:
-            chunk_end = i + chunk_size
-            if chunk_end > len(sequences) + 8:
-                continue
-
-            cut_sequence = sequences[i : i + chunk_size]
-
-            if bom_or_sig_available and strip_sig_or_bom is False:
-                cut_sequence = sig_payload + cut_sequence
-
-            chunk = cut_sequence.decode(
-                encoding_iana,
-                errors="ignore" if is_multi_byte_decoder else "strict",
-            )
-
-            # multi-byte bad cutting detector and adjustment
-            # not the cleanest way to perform that fix but clever enough for now.
-            if is_multi_byte_decoder and i > 0:
-                chunk_partial_size_chk: int = min(chunk_size, 16)
-
-                if (
-                    decoded_payload
-                    and chunk[:chunk_partial_size_chk] not in decoded_payload
-                ):
-                    for j in range(i, i - 4, -1):
-                        cut_sequence = sequences[j:chunk_end]
-
-                        if bom_or_sig_available and strip_sig_or_bom is False:
-                            cut_sequence = sig_payload + cut_sequence
-
-                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
-
-                        if chunk[:chunk_partial_size_chk] in decoded_payload:
-                            break
-
-            yield chunk
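
A few of these helpers are useful on their own; a minimal sketch exercising them directly (charset_normalizer 3.4.x internals, so treat the import path as an assumption; expected results shown as comments):

    from charset_normalizer.utils import (
        iana_name,
        identify_sig_or_bom,
        is_accentuated,
        unicode_range,
    )

    print(iana_name("UTF-8"))   # 'utf_8' (Python-normalized name)
    print(unicode_range("é"))   # 'Latin-1 Supplement'
    print(is_accentuated("é"))  # True
    print(identify_sig_or_bom(b"\xef\xbb\xbfhello"))  # ('utf_8', b'\xef\xbb\xbf')
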
env/Lib/site-packages/charset_normalizer/version.py DELETED
@@ -1,8 +0,0 @@
-"""
-Expose version
-"""
-
-from __future__ import annotations
-
-__version__ = "3.4.1"
-VERSION = __version__.split(".")
env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER DELETED
@@ -1 +0,0 @@
-pip
env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA DELETED
@@ -1,441 +0,0 @@
-Metadata-Version: 2.1
-Name: colorama
-Version: 0.4.6
-Summary: Cross-platform colored terminal text.
-Project-URL: Homepage, https://github.com/tartley/colorama
-Author-email: Jonathan Hartley <[email protected]>
-License-File: LICENSE.txt
-Keywords: ansi,color,colour,crossplatform,terminal,text,windows,xplatform
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Environment :: Console
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: BSD License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 2
-Classifier: Programming Language :: Python :: 2.7
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: Implementation :: CPython
-Classifier: Programming Language :: Python :: Implementation :: PyPy
-Classifier: Topic :: Terminals
-Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7
-Description-Content-Type: text/x-rst
-
-.. image:: https://img.shields.io/pypi/v/colorama.svg
-    :target: https://pypi.org/project/colorama/
-    :alt: Latest Version
-
-.. image:: https://img.shields.io/pypi/pyversions/colorama.svg
-    :target: https://pypi.org/project/colorama/
-    :alt: Supported Python versions
-
-.. image:: https://github.com/tartley/colorama/actions/workflows/test.yml/badge.svg
-    :target: https://github.com/tartley/colorama/actions/workflows/test.yml
-    :alt: Build Status
-
-Colorama
-========
-
-Makes ANSI escape character sequences (for producing colored terminal text and
-cursor positioning) work under MS Windows.
-
-.. |donate| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif
-    :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=2MZ9D2GMLYCUJ&item_name=Colorama&currency_code=USD
-    :alt: Donate with Paypal
-
-`PyPI for releases <https://pypi.org/project/colorama/>`_ |
-`Github for source <https://github.com/tartley/colorama>`_ |
-`Colorama for enterprise on Tidelift <https://github.com/tartley/colorama/blob/master/ENTERPRISE.md>`_
-
-If you find Colorama useful, please |donate| to the authors. Thank you!
-
-Installation
-------------
-
-Tested on CPython 2.7, 3.7, 3.8, 3.9 and 3.10 and Pypy 2.7 and 3.8.
-
-No requirements other than the standard library.
-
-.. code-block:: bash
-
-    pip install colorama
-    # or
-    conda install -c anaconda colorama
-
-Description
------------
-
-ANSI escape character sequences have long been used to produce colored terminal
-text and cursor positioning on Unix and Macs. Colorama makes this work on
-Windows, too, by wrapping ``stdout``, stripping ANSI sequences it finds (which
-would appear as gobbledygook in the output), and converting them into the
-appropriate win32 calls to modify the state of the terminal. On other platforms,
-Colorama does nothing.
-
-This has the upshot of providing a simple cross-platform API for printing
-colored terminal text from Python, and has the happy side-effect that existing
-applications or libraries which use ANSI sequences to produce colored output on
-Linux or Macs can now also work on Windows, simply by calling
-``colorama.just_fix_windows_console()`` (since v0.4.6) or ``colorama.init()``
-(all versions, but may have other side-effects – see below).
-
-An alternative approach is to install ``ansi.sys`` on Windows machines, which
-provides the same behaviour for all applications running in terminals. Colorama
-is intended for situations where that isn't easy (e.g., maybe your app doesn't
-have an installer.)
-
-Demo scripts in the source code repository print some colored text using
-ANSI sequences. Compare their output under Gnome-terminal's built in ANSI
-handling, versus on Windows Command-Prompt using Colorama:
-
-.. image:: https://github.com/tartley/colorama/raw/master/screenshots/ubuntu-demo.png
-    :width: 661
-    :height: 357
-    :alt: ANSI sequences on Ubuntu under gnome-terminal.
-
-.. image:: https://github.com/tartley/colorama/raw/master/screenshots/windows-demo.png
-    :width: 668
-    :height: 325
-    :alt: Same ANSI sequences on Windows, using Colorama.
-
-These screenshots show that, on Windows, Colorama does not support ANSI 'dim
-text'; it looks the same as 'normal text'.
-
-Usage
------
-
-Initialisation
-..............
-
-If the only thing you want from Colorama is to get ANSI escapes to work on
-Windows, then run:
-
-.. code-block:: python
-
-    from colorama import just_fix_windows_console
-    just_fix_windows_console()
-
-If you're on a recent version of Windows 10 or better, and your stdout/stderr
-are pointing to a Windows console, then this will flip the magic configuration
-switch to enable Windows' built-in ANSI support.
-
-If you're on an older version of Windows, and your stdout/stderr are pointing to
-a Windows console, then this will wrap ``sys.stdout`` and/or ``sys.stderr`` in a
-magic file object that intercepts ANSI escape sequences and issues the
-appropriate Win32 calls to emulate them.
-
-In all other circumstances, it does nothing whatsoever. Basically the idea is
-that this makes Windows act like Unix with respect to ANSI escape handling.
-
-It's safe to call this function multiple times. It's safe to call this function
-on non-Windows platforms, but it won't do anything. It's safe to call this
-function when one or both of your stdout/stderr are redirected to a file – it
-won't do anything to those streams.
-
-Alternatively, you can use the older interface with more features (but also more
-potential footguns):
-
-.. code-block:: python
-
-    from colorama import init
-    init()
-
-This does the same thing as ``just_fix_windows_console``, except for the
-following differences:
-
-- It's not safe to call ``init`` multiple times; you can end up with multiple
-  layers of wrapping and broken ANSI support.
-
-- Colorama will apply a heuristic to guess whether stdout/stderr support ANSI,
-  and if it thinks they don't, then it will wrap ``sys.stdout`` and
-  ``sys.stderr`` in a magic file object that strips out ANSI escape sequences
-  before printing them. This happens on all platforms, and can be convenient if
-  you want to write your code to emit ANSI escape sequences unconditionally, and
-  let Colorama decide whether they should actually be output. But note that
-  Colorama's heuristic is not particularly clever.
-
-- ``init`` also accepts explicit keyword args to enable/disable various
-  functionality – see below.
-
-To stop using Colorama before your program exits, simply call ``deinit()``.
-This will restore ``stdout`` and ``stderr`` to their original values, so that
-Colorama is disabled. To resume using Colorama again, call ``reinit()``; it is
-cheaper than calling ``init()`` again (but does the same thing).
-
-Most users should depend on ``colorama >= 0.4.6``, and use
-``just_fix_windows_console``. The old ``init`` interface will be supported
-indefinitely for backwards compatibility, but we don't plan to fix any issues
-with it, also for backwards compatibility.
-
-Colored Output
-..............
-
-Cross-platform printing of colored text can then be done using Colorama's
-constant shorthand for ANSI escape sequences. These are deliberately
-rudimentary, see below.
-
-.. code-block:: python
-
-    from colorama import Fore, Back, Style
-    print(Fore.RED + 'some red text')
-    print(Back.GREEN + 'and with a green background')
-    print(Style.DIM + 'and in dim text')
-    print(Style.RESET_ALL)
-    print('back to normal now')
-
-...or simply by manually printing ANSI sequences from your own code:
-
-.. code-block:: python
-
-    print('\033[31m' + 'some red text')
-    print('\033[39m') # and reset to default color
-
-...or, Colorama can be used in conjunction with existing ANSI libraries
-such as the venerable `Termcolor <https://pypi.org/project/termcolor/>`_
-the fabulous `Blessings <https://pypi.org/project/blessings/>`_,
-or the incredible `_Rich <https://pypi.org/project/rich/>`_.
-
-If you wish Colorama's Fore, Back and Style constants were more capable,
-then consider using one of the above highly capable libraries to generate
-colors, etc, and use Colorama just for its primary purpose: to convert
-those ANSI sequences to also work on Windows:
-
-SIMILARLY, do not send PRs adding the generation of new ANSI types to Colorama.
-We are only interested in converting ANSI codes to win32 API calls, not
-shortcuts like the above to generate ANSI characters.
-
-.. code-block:: python
-
-    from colorama import just_fix_windows_console
-    from termcolor import colored
-
-    # use Colorama to make Termcolor work on Windows too
-    just_fix_windows_console()
-
-    # then use Termcolor for all colored text output
-    print(colored('Hello, World!', 'green', 'on_red'))
-
-Available formatting constants are::
-
-    Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
-    Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
-    Style: DIM, NORMAL, BRIGHT, RESET_ALL
-
-``Style.RESET_ALL`` resets foreground, background, and brightness. Colorama will
-perform this reset automatically on program exit.
-
-These are fairly well supported, but not part of the standard::
-
-    Fore: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
-    Back: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
-
-Cursor Positioning
-..................
-
-ANSI codes to reposition the cursor are supported. See ``demos/demo06.py`` for
-an example of how to generate them.
-
-Init Keyword Args
-.................
-
-``init()`` accepts some ``**kwargs`` to override default behaviour.
-
-init(autoreset=False):
-    If you find yourself repeatedly sending reset sequences to turn off color
-    changes at the end of every print, then ``init(autoreset=True)`` will
-    automate that:
-
-    .. code-block:: python
-
-        from colorama import init
-        init(autoreset=True)
-        print(Fore.RED + 'some red text')
-        print('automatically back to default color again')
-
-init(strip=None):
-    Pass ``True`` or ``False`` to override whether ANSI codes should be
-    stripped from the output. The default behaviour is to strip if on Windows
-    or if output is redirected (not a tty).
-
-init(convert=None):
-    Pass ``True`` or ``False`` to override whether to convert ANSI codes in the
-    output into win32 calls. The default behaviour is to convert if on Windows
-    and output is to a tty (terminal).
-
-init(wrap=True):
-    On Windows, Colorama works by replacing ``sys.stdout`` and ``sys.stderr``
-    with proxy objects, which override the ``.write()`` method to do their work.
-    If this wrapping causes you problems, then this can be disabled by passing
-    ``init(wrap=False)``. The default behaviour is to wrap if ``autoreset`` or
-    ``strip`` or ``convert`` are True.
-
-    When wrapping is disabled, colored printing on non-Windows platforms will
-    continue to work as normal. To do cross-platform colored output, you can
-    use Colorama's ``AnsiToWin32`` proxy directly:
-
-    .. code-block:: python
-
-        import sys
-        from colorama import init, AnsiToWin32
-        init(wrap=False)
-        stream = AnsiToWin32(sys.stderr).stream
-
-        # Python 2
-        print >>stream, Fore.BLUE + 'blue text on stderr'
-
-        # Python 3
-        print(Fore.BLUE + 'blue text on stderr', file=stream)
-
-Recognised ANSI Sequences
-.........................
-
-ANSI sequences generally take the form::
-
-    ESC [ <param> ; <param> ... <command>
-
-Where ``<param>`` is an integer, and ``<command>`` is a single letter. Zero or
-more params are passed to a ``<command>``. If no params are passed, it is
-generally synonymous with passing a single zero. No spaces exist in the
-sequence; they have been inserted here simply to read more easily.
-
-The only ANSI sequences that Colorama converts into win32 calls are::
-
-    ESC [ 0 m       # reset all (colors and brightness)
-    ESC [ 1 m       # bright
-    ESC [ 2 m       # dim (looks same as normal brightness)
-    ESC [ 22 m      # normal brightness
-
-    # FOREGROUND:
-    ESC [ 30 m      # black
-    ESC [ 31 m      # red
-    ESC [ 32 m      # green
-    ESC [ 33 m      # yellow
-    ESC [ 34 m      # blue
-    ESC [ 35 m      # magenta
-    ESC [ 36 m      # cyan
-    ESC [ 37 m      # white
-    ESC [ 39 m      # reset
-
-    # BACKGROUND
-    ESC [ 40 m      # black
-    ESC [ 41 m      # red
-    ESC [ 42 m      # green
-    ESC [ 43 m      # yellow
-    ESC [ 44 m      # blue
-    ESC [ 45 m      # magenta
-    ESC [ 46 m      # cyan
-    ESC [ 47 m      # white
-    ESC [ 49 m      # reset
-
-    # cursor positioning
-    ESC [ y;x H     # position cursor at x across, y down
-    ESC [ y;x f     # position cursor at x across, y down
-    ESC [ n A       # move cursor n lines up
-    ESC [ n B       # move cursor n lines down
-    ESC [ n C       # move cursor n characters forward
-    ESC [ n D       # move cursor n characters backward
-
-    # clear the screen
-    ESC [ mode J    # clear the screen
-
-    # clear the line
-    ESC [ mode K    # clear the line
-
-Multiple numeric params to the ``'m'`` command can be combined into a single
-sequence::
-
-    ESC [ 36 ; 45 ; 1 m     # bright cyan text on magenta background
-
-All other ANSI sequences of the form ``ESC [ <param> ; <param> ... <command>``
-are silently stripped from the output on Windows.
-
-Any other form of ANSI sequence, such as single-character codes or alternative
-initial characters, are not recognised or stripped. It would be cool to add
-them though. Let me know if it would be useful for you, via the Issues on
-GitHub.
-
-Status & Known Problems
------------------------
-
-I've personally only tested it on Windows XP (CMD, Console2), Ubuntu
-(gnome-terminal, xterm), and OS X.
-
-Some valid ANSI sequences aren't recognised.
-
-If you're hacking on the code, see `README-hacking.md`_. ESPECIALLY, see the
-explanation there of why we do not want PRs that allow Colorama to generate new
-types of ANSI codes.
-
-See outstanding issues and wish-list:
-https://github.com/tartley/colorama/issues
-
-If anything doesn't work for you, or doesn't do what you expected or hoped for,
-I'd love to hear about it on that issues list, would be delighted by patches,
-and would be happy to grant commit access to anyone who submits a working patch
-or two.
-
-.. _README-hacking.md: README-hacking.md
-
-License
--------
-
-Copyright Jonathan Hartley & Arnon Yaari, 2013-2020. BSD 3-Clause license; see
-LICENSE file.
-
-Professional support
---------------------
-
-.. |tideliftlogo| image:: https://cdn2.hubspot.net/hubfs/4008838/website/logos/logos_for_download/Tidelift_primary-shorthand-logo.png
-    :alt: Tidelift
-    :target: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
-
-.. list-table::
-    :widths: 10 100
-
-    * - |tideliftlogo|
-      - Professional support for colorama is available as part of the
-        `Tidelift Subscription`_.
-        Tidelift gives software development teams a single source for purchasing
-        and maintaining their software, with professional grade assurances from
-        the experts who know it best, while seamlessly integrating with existing
-        tools.
-
-.. _Tidelift Subscription: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
-
-Thanks
-------
-
-See the CHANGELOG for more thanks!
-
-* Marc Schlaich (schlamar) for a ``setup.py`` fix for Python2.5.
-* Marc Abramowitz, reported & fixed a crash on exit with closed ``stdout``,
-  providing a solution to issue #7's setuptools/distutils debate,
-  and other fixes.
-* User 'eryksun', for guidance on correctly instantiating ``ctypes.windll``.
-* Matthew McCormick for politely pointing out a longstanding crash on non-Win.
-* Ben Hoyt, for a magnificent fix under 64-bit Windows.
-* Jesse at Empty Square for submitting a fix for examples in the README.
-* User 'jamessp', an observant documentation fix for cursor positioning.
-* User 'vaal1239', Dave Mckee & Lackner Kristof for a tiny but much-needed Win7
-  fix.
-* Julien Stuyck, for wisely suggesting Python3 compatible updates to README.
-* Daniel Griffith for multiple fabulous patches.
-* Oscar Lesta for a valuable fix to stop ANSI chars being sent to non-tty
-  output.
-* Roger Binns, for many suggestions, valuable feedback, & bug reports.
-* Tim Golden for thought and much appreciated feedback on the initial idea.
-* User 'Zearin' for updates to the README file.
-* John Szakmeister for adding support for light colors
-* Charles Merriam for adding documentation to demos
-* Jurko for a fix on 64-bit Windows CPython2.5 w/o ctypes
-* Florian Bruhin for a fix when stdout or stderr are None
-* Thomas Weininger for fixing ValueError on Windows
-* Remi Rampin for better Github integration and fixes to the README file
-* Simeon Visser for closing a file handle using 'with' and updating classifiers
-  to include Python 3.3 and 3.4
-* Andy Neff for fixing RESET of LIGHT_EX colors.
-* Jonathan Hartley for the initial idea and implementation.
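
To make the sequence grammar described in the README above concrete, here is its combined 'm' example emitted from plain Python (assumes an ANSI-capable terminal; no colorama needed on Unix):

    import sys

    # ESC [ 36 ; 45 ; 1 m  ->  bright cyan text on magenta background
    sys.stdout.write('\033[36;45;1m' + 'bright cyan on magenta' + '\033[0m\n')
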
env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD DELETED
@@ -1,31 +0,0 @@
-colorama-0.4.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
-colorama-0.4.6.dist-info/METADATA,sha256=e67SnrUMOym9sz_4TjF3vxvAV4T3aF7NyqRHHH3YEMw,17158
-colorama-0.4.6.dist-info/RECORD,,
-colorama-0.4.6.dist-info/WHEEL,sha256=cdcF4Fbd0FPtw2EMIOwH-3rSOTUdTCeOSXRMD1iLUb8,105
-colorama-0.4.6.dist-info/licenses/LICENSE.txt,sha256=ysNcAmhuXQSlpxQL-zs25zrtSWZW6JEQLkKIhteTAxg,1491
-colorama/__init__.py,sha256=wePQA4U20tKgYARySLEC047ucNX-g8pRLpYBuiHlLb8,266
-colorama/__pycache__/__init__.cpython-312.pyc,,
-colorama/__pycache__/ansi.cpython-312.pyc,,
-colorama/__pycache__/ansitowin32.cpython-312.pyc,,
-colorama/__pycache__/initialise.cpython-312.pyc,,
-colorama/__pycache__/win32.cpython-312.pyc,,
-colorama/__pycache__/winterm.cpython-312.pyc,,
-colorama/ansi.py,sha256=Top4EeEuaQdBWdteKMEcGOTeKeF19Q-Wo_6_Cj5kOzQ,2522
-colorama/ansitowin32.py,sha256=vPNYa3OZbxjbuFyaVo0Tmhmy1FZ1lKMWCnT7odXpItk,11128
-colorama/initialise.py,sha256=-hIny86ClXo39ixh5iSCfUIa2f_h_bgKRDW7gqs-KLU,3325
-colorama/tests/__init__.py,sha256=MkgPAEzGQd-Rq0w0PZXSX2LadRWhUECcisJY8lSrm4Q,75
-colorama/tests/__pycache__/__init__.cpython-312.pyc,,
-colorama/tests/__pycache__/ansi_test.cpython-312.pyc,,
-colorama/tests/__pycache__/ansitowin32_test.cpython-312.pyc,,
-colorama/tests/__pycache__/initialise_test.cpython-312.pyc,,
-colorama/tests/__pycache__/isatty_test.cpython-312.pyc,,
-colorama/tests/__pycache__/utils.cpython-312.pyc,,
-colorama/tests/__pycache__/winterm_test.cpython-312.pyc,,
-colorama/tests/ansi_test.py,sha256=FeViDrUINIZcr505PAxvU4AjXz1asEiALs9GXMhwRaE,2839
-colorama/tests/ansitowin32_test.py,sha256=RN7AIhMJ5EqDsYaCjVo-o4u8JzDD4ukJbmevWKS70rY,10678
-colorama/tests/initialise_test.py,sha256=BbPy-XfyHwJ6zKozuQOvNvQZzsx9vdb_0bYXn7hsBTc,6741
-colorama/tests/isatty_test.py,sha256=Pg26LRpv0yQDB5Ac-sxgVXG7hsA1NYvapFgApZfYzZg,1866
-colorama/tests/utils.py,sha256=1IIRylG39z5-dzq09R_ngufxyPZxgldNbrxKxUGwGKE,1079
-colorama/tests/winterm_test.py,sha256=qoWFPEjym5gm2RuMwpf3pOis3a5r_PJZFCzK254JL8A,3709
-colorama/win32.py,sha256=YQOKwMTwtGBbsY4dL5HYTvwTeP9wIQra5MvPNddpxZs,6181
-colorama/winterm.py,sha256=XCQFDHjPi6AHYNdZwy0tA02H-Jh48Jp-HvCjeLeLp3U,7134
env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL DELETED
@@ -1,5 +0,0 @@
- Wheel-Version: 1.0
- Generator: hatchling 1.11.1
- Root-Is-Purelib: true
- Tag: py2-none-any
- Tag: py3-none-any
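
WHEEL is RFC 822-style key/value metadata; the two Tag keys mark this as a universal pure-Python wheel for both Python 2 and 3. A sketch of reading it with the standard library, assuming the dist-info path above exists locally:

    from email.parser import Parser

    # Path assumes an installed copy of the metadata shown above.
    with open("colorama-0.4.6.dist-info/WHEEL") as f:
        meta = Parser().parse(f)

    print(meta["Wheel-Version"])   # 1.0
    print(meta.get_all("Tag"))     # ['py2-none-any', 'py3-none-any']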
env/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt DELETED
@@ -1,27 +0,0 @@
- Copyright (c) 2010 Jonathan Hartley
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of the copyright holders, nor those of its contributors
-   may be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
env/Lib/site-packages/colorama/__init__.py DELETED
@@ -1,7 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
- from .initialise import init, deinit, reinit, colorama_text, just_fix_windows_console
- from .ansi import Fore, Back, Style, Cursor
- from .ansitowin32 import AnsiToWin32
-
- __version__ = '0.4.6'
-
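
These four import lines are colorama's entire public surface: init and friends from initialise, the Fore/Back/Style/Cursor code tables from ansi, and the AnsiToWin32 converter. Typical upstream usage looks like this:

    from colorama import Fore, Style, init

    init(autoreset=True)                # wrap sys.stdout/sys.stderr; reset styling after each write
    print(Fore.RED + 'some red text')   # colored output, including on legacy Windows consoles
    print(Style.BRIGHT + 'and bright')  # intensity rather than true bold
    print('back to normal now')         # autoreset already restored the defaults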
env/Lib/site-packages/colorama/ansi.py DELETED
@@ -1,102 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
- '''
- This module generates ANSI character codes to printing colors to terminals.
- See: http://en.wikipedia.org/wiki/ANSI_escape_code
- '''
-
- CSI = '\033['
- OSC = '\033]'
- BEL = '\a'
-
-
- def code_to_chars(code):
-     return CSI + str(code) + 'm'
-
- def set_title(title):
-     return OSC + '2;' + title + BEL
-
- def clear_screen(mode=2):
-     return CSI + str(mode) + 'J'
-
- def clear_line(mode=2):
-     return CSI + str(mode) + 'K'
-
-
- class AnsiCodes(object):
-     def __init__(self):
-         # the subclasses declare class attributes which are numbers.
-         # Upon instantiation we define instance attributes, which are the same
-         # as the class attributes but wrapped with the ANSI escape sequence
-         for name in dir(self):
-             if not name.startswith('_'):
-                 value = getattr(self, name)
-                 setattr(self, name, code_to_chars(value))
-
-
- class AnsiCursor(object):
-     def UP(self, n=1):
-         return CSI + str(n) + 'A'
-     def DOWN(self, n=1):
-         return CSI + str(n) + 'B'
-     def FORWARD(self, n=1):
-         return CSI + str(n) + 'C'
-     def BACK(self, n=1):
-         return CSI + str(n) + 'D'
-     def POS(self, x=1, y=1):
-         return CSI + str(y) + ';' + str(x) + 'H'
-
-
- class AnsiFore(AnsiCodes):
-     BLACK = 30
-     RED = 31
-     GREEN = 32
-     YELLOW = 33
-     BLUE = 34
-     MAGENTA = 35
-     CYAN = 36
-     WHITE = 37
-     RESET = 39
-
-     # These are fairly well supported, but not part of the standard.
-     LIGHTBLACK_EX = 90
-     LIGHTRED_EX = 91
-     LIGHTGREEN_EX = 92
-     LIGHTYELLOW_EX = 93
-     LIGHTBLUE_EX = 94
-     LIGHTMAGENTA_EX = 95
-     LIGHTCYAN_EX = 96
-     LIGHTWHITE_EX = 97
-
-
- class AnsiBack(AnsiCodes):
-     BLACK = 40
-     RED = 41
-     GREEN = 42
-     YELLOW = 43
-     BLUE = 44
-     MAGENTA = 45
-     CYAN = 46
-     WHITE = 47
-     RESET = 49
-
-     # These are fairly well supported, but not part of the standard.
-     LIGHTBLACK_EX = 100
-     LIGHTRED_EX = 101
-     LIGHTGREEN_EX = 102
-     LIGHTYELLOW_EX = 103
-     LIGHTBLUE_EX = 104
-     LIGHTMAGENTA_EX = 105
-     LIGHTCYAN_EX = 106
-     LIGHTWHITE_EX = 107
-
-
- class AnsiStyle(AnsiCodes):
-     BRIGHT = 1
-     DIM = 2
-     NORMAL = 22
-     RESET_ALL = 0
-
- Fore = AnsiFore()
- Back = AnsiBack()
- Style = AnsiStyle()
- Cursor = AnsiCursor()
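
The AnsiCodes base class above is the core trick: at class level each attribute is a plain integer, and __init__ rewrites it into a full escape sequence, so the Fore/Back/Style instances expose ready-to-print strings. A small self-contained check of the mechanics, using values from the code above:

    CSI = '\033['

    def code_to_chars(code):
        return CSI + str(code) + 'm'

    # AnsiFore.RED is the integer 31 at class level; after instantiation,
    # Fore.RED is the wrapped escape sequence.
    assert code_to_chars(31) == '\033[31m'

    # AnsiCursor.POS puts the row (y) before the column (x), as the 'H'
    # CSI command expects.
    assert CSI + '5' + ';' + '10' + 'H' == '\033[5;10H'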
env/Lib/site-packages/colorama/ansitowin32.py DELETED
@@ -1,277 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
- import re
- import sys
- import os
-
- from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
- from .winterm import enable_vt_processing, WinTerm, WinColor, WinStyle
- from .win32 import windll, winapi_test
-
-
- winterm = None
- if windll is not None:
-     winterm = WinTerm()
-
-
- class StreamWrapper(object):
-     '''
-     Wraps a stream (such as stdout), acting as a transparent proxy for all
-     attribute access apart from method 'write()', which is delegated to our
-     Converter instance.
-     '''
-     def __init__(self, wrapped, converter):
-         # double-underscore everything to prevent clashes with names of
-         # attributes on the wrapped stream object.
-         self.__wrapped = wrapped
-         self.__convertor = converter
-
-     def __getattr__(self, name):
-         return getattr(self.__wrapped, name)
-
-     def __enter__(self, *args, **kwargs):
-         # special method lookup bypasses __getattr__/__getattribute__, see
-         # https://stackoverflow.com/questions/12632894/why-doesnt-getattr-work-with-exit
-         # thus, contextlib magic methods are not proxied via __getattr__
-         return self.__wrapped.__enter__(*args, **kwargs)
-
-     def __exit__(self, *args, **kwargs):
-         return self.__wrapped.__exit__(*args, **kwargs)
-
-     def __setstate__(self, state):
-         self.__dict__ = state
-
-     def __getstate__(self):
-         return self.__dict__
-
-     def write(self, text):
-         self.__convertor.write(text)
-
-     def isatty(self):
-         stream = self.__wrapped
-         if 'PYCHARM_HOSTED' in os.environ:
-             if stream is not None and (stream is sys.__stdout__ or stream is sys.__stderr__):
-                 return True
-         try:
-             stream_isatty = stream.isatty
-         except AttributeError:
-             return False
-         else:
-             return stream_isatty()
-
-     @property
-     def closed(self):
-         stream = self.__wrapped
-         try:
-             return stream.closed
-         # AttributeError in the case that the stream doesn't support being closed
-         # ValueError for the case that the stream has already been detached when atexit runs
-         except (AttributeError, ValueError):
-             return True
-
-
- class AnsiToWin32(object):
-     '''
-     Implements a 'write()' method which, on Windows, will strip ANSI character
-     sequences from the text, and if outputting to a tty, will convert them into
-     win32 function calls.
-     '''
-     ANSI_CSI_RE = re.compile('\001?\033\\[((?:\\d|;)*)([a-zA-Z])\002?')   # Control Sequence Introducer
-     ANSI_OSC_RE = re.compile('\001?\033\\]([^\a]*)(\a)\002?')             # Operating System Command
-
-     def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
-         # The wrapped stream (normally sys.stdout or sys.stderr)
-         self.wrapped = wrapped
-
-         # should we reset colors to defaults after every .write()
-         self.autoreset = autoreset
-
-         # create the proxy wrapping our output stream
-         self.stream = StreamWrapper(wrapped, self)
-
-         on_windows = os.name == 'nt'
-         # We test if the WinAPI works, because even if we are on Windows
-         # we may be using a terminal that doesn't support the WinAPI
-         # (e.g. Cygwin Terminal). In this case it's up to the terminal
-         # to support the ANSI codes.
-         conversion_supported = on_windows and winapi_test()
-         try:
-             fd = wrapped.fileno()
-         except Exception:
-             fd = -1
-         system_has_native_ansi = not on_windows or enable_vt_processing(fd)
-         have_tty = not self.stream.closed and self.stream.isatty()
-         need_conversion = conversion_supported and not system_has_native_ansi
-
-         # should we strip ANSI sequences from our output?
-         if strip is None:
-             strip = need_conversion or not have_tty
-         self.strip = strip
-
-         # should we should convert ANSI sequences into win32 calls?
-         if convert is None:
-             convert = need_conversion and have_tty
-         self.convert = convert
-
-         # dict of ansi codes to win32 functions and parameters
-         self.win32_calls = self.get_win32_calls()
-
-         # are we wrapping stderr?
-         self.on_stderr = self.wrapped is sys.stderr
-
-     def should_wrap(self):
-         '''
-         True if this class is actually needed. If false, then the output
-         stream will not be affected, nor will win32 calls be issued, so
-         wrapping stdout is not actually required. This will generally be
-         False on non-Windows platforms, unless optional functionality like
-         autoreset has been requested using kwargs to init()
-         '''
-         return self.convert or self.strip or self.autoreset
-
-     def get_win32_calls(self):
-         if self.convert and winterm:
-             return {
-                 AnsiStyle.RESET_ALL: (winterm.reset_all, ),
-                 AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
-                 AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
-                 AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
-                 AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
-                 AnsiFore.RED: (winterm.fore, WinColor.RED),
-                 AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
-                 AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
-                 AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
-                 AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
-                 AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
-                 AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
-                 AnsiFore.RESET: (winterm.fore, ),
-                 AnsiFore.LIGHTBLACK_EX: (winterm.fore, WinColor.BLACK, True),
-                 AnsiFore.LIGHTRED_EX: (winterm.fore, WinColor.RED, True),
-                 AnsiFore.LIGHTGREEN_EX: (winterm.fore, WinColor.GREEN, True),
-                 AnsiFore.LIGHTYELLOW_EX: (winterm.fore, WinColor.YELLOW, True),
-                 AnsiFore.LIGHTBLUE_EX: (winterm.fore, WinColor.BLUE, True),
-                 AnsiFore.LIGHTMAGENTA_EX: (winterm.fore, WinColor.MAGENTA, True),
-                 AnsiFore.LIGHTCYAN_EX: (winterm.fore, WinColor.CYAN, True),
-                 AnsiFore.LIGHTWHITE_EX: (winterm.fore, WinColor.GREY, True),
-                 AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
-                 AnsiBack.RED: (winterm.back, WinColor.RED),
-                 AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
-                 AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
-                 AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
-                 AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
-                 AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
-                 AnsiBack.WHITE: (winterm.back, WinColor.GREY),
-                 AnsiBack.RESET: (winterm.back, ),
-                 AnsiBack.LIGHTBLACK_EX: (winterm.back, WinColor.BLACK, True),
-                 AnsiBack.LIGHTRED_EX: (winterm.back, WinColor.RED, True),
-                 AnsiBack.LIGHTGREEN_EX: (winterm.back, WinColor.GREEN, True),
-                 AnsiBack.LIGHTYELLOW_EX: (winterm.back, WinColor.YELLOW, True),
-                 AnsiBack.LIGHTBLUE_EX: (winterm.back, WinColor.BLUE, True),
-                 AnsiBack.LIGHTMAGENTA_EX: (winterm.back, WinColor.MAGENTA, True),
-                 AnsiBack.LIGHTCYAN_EX: (winterm.back, WinColor.CYAN, True),
-                 AnsiBack.LIGHTWHITE_EX: (winterm.back, WinColor.GREY, True),
-             }
-         return dict()
-
-     def write(self, text):
-         if self.strip or self.convert:
-             self.write_and_convert(text)
-         else:
-             self.wrapped.write(text)
-             self.wrapped.flush()
-         if self.autoreset:
-             self.reset_all()
-
-
-     def reset_all(self):
-         if self.convert:
-             self.call_win32('m', (0,))
-         elif not self.strip and not self.stream.closed:
-             self.wrapped.write(Style.RESET_ALL)
-
-
-     def write_and_convert(self, text):
-         '''
-         Write the given text to our wrapped stream, stripping any ANSI
-         sequences from the text, and optionally converting them into win32
-         calls.
-         '''
-         cursor = 0
-         text = self.convert_osc(text)
-         for match in self.ANSI_CSI_RE.finditer(text):
-             start, end = match.span()
-             self.write_plain_text(text, cursor, start)
-             self.convert_ansi(*match.groups())
-             cursor = end
-         self.write_plain_text(text, cursor, len(text))
-
-
-     def write_plain_text(self, text, start, end):
-         if start < end:
-             self.wrapped.write(text[start:end])
-             self.wrapped.flush()
-
-
-     def convert_ansi(self, paramstring, command):
-         if self.convert:
-             params = self.extract_params(command, paramstring)
-             self.call_win32(command, params)
-
-
-     def extract_params(self, command, paramstring):
-         if command in 'Hf':
-             params = tuple(int(p) if len(p) != 0 else 1 for p in paramstring.split(';'))
-             while len(params) < 2:
-                 # defaults:
-                 params = params + (1,)
-         else:
-             params = tuple(int(p) for p in paramstring.split(';') if len(p) != 0)
-             if len(params) == 0:
-                 # defaults:
-                 if command in 'JKm':
-                     params = (0,)
-                 elif command in 'ABCD':
-                     params = (1,)
-
-         return params
-
-
-     def call_win32(self, command, params):
-         if command == 'm':
-             for param in params:
-                 if param in self.win32_calls:
-                     func_args = self.win32_calls[param]
-                     func = func_args[0]
-                     args = func_args[1:]
-                     kwargs = dict(on_stderr=self.on_stderr)
-                     func(*args, **kwargs)
-         elif command in 'J':
-             winterm.erase_screen(params[0], on_stderr=self.on_stderr)
-         elif command in 'K':
-             winterm.erase_line(params[0], on_stderr=self.on_stderr)
-         elif command in 'Hf':     # cursor position - absolute
-             winterm.set_cursor_position(params, on_stderr=self.on_stderr)
-         elif command in 'ABCD':   # cursor position - relative
-             n = params[0]
-             # A - up, B - down, C - forward, D - back
-             x, y = {'A': (0, -n), 'B': (0, n), 'C': (n, 0), 'D': (-n, 0)}[command]
-             winterm.cursor_adjust(x, y, on_stderr=self.on_stderr)
-
-
-     def convert_osc(self, text):
-         for match in self.ANSI_OSC_RE.finditer(text):
-             start, end = match.span()
-             text = text[:start] + text[end:]
-             paramstring, command = match.groups()
-             if command == BEL:
-                 if paramstring.count(";") == 1:
-                     params = paramstring.split(";")
-                     # 0 - change title and icon (we will only change title)
-                     # 1 - change icon (we don't support this)
-                     # 2 - change title
-                     if params[0] in '02':
-                         winterm.set_title(params[1])
-         return text
-
-
-     def flush(self):
-         self.wrapped.flush()
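
The net effect of the constructor logic above is easy to observe on a non-tty stream: with no tty to convert for, strip defaults to True and convert to False, so CSI sequences are removed before any text reaches the wrapped stream. A sketch, assuming colorama is importable:

    from io import StringIO
    from colorama.ansitowin32 import AnsiToWin32

    buf = StringIO()             # not a tty, so 'strip' defaults to True
    wrapper = AnsiToWin32(buf)
    wrapper.stream.write('\033[31mred?\033[0m plain')
    print(repr(buf.getvalue()))  # 'red? plain': the CSI sequences never reach the stream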
env/Lib/site-packages/colorama/initialise.py DELETED
@@ -1,121 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
- import atexit
- import contextlib
- import sys
-
- from .ansitowin32 import AnsiToWin32
-
-
- def _wipe_internal_state_for_tests():
-     global orig_stdout, orig_stderr
-     orig_stdout = None
-     orig_stderr = None
-
-     global wrapped_stdout, wrapped_stderr
-     wrapped_stdout = None
-     wrapped_stderr = None
-
-     global atexit_done
-     atexit_done = False
-
-     global fixed_windows_console
-     fixed_windows_console = False
-
-     try:
-         # no-op if it wasn't registered
-         atexit.unregister(reset_all)
-     except AttributeError:
-         # python 2: no atexit.unregister. Oh well, we did our best.
-         pass
-
-
- def reset_all():
-     if AnsiToWin32 is not None:    # Issue #74: objects might become None at exit
-         AnsiToWin32(orig_stdout).reset_all()
-
-
- def init(autoreset=False, convert=None, strip=None, wrap=True):
-
-     if not wrap and any([autoreset, convert, strip]):
-         raise ValueError('wrap=False conflicts with any other arg=True')
-
-     global wrapped_stdout, wrapped_stderr
-     global orig_stdout, orig_stderr
-
-     orig_stdout = sys.stdout
-     orig_stderr = sys.stderr
-
-     if sys.stdout is None:
-         wrapped_stdout = None
-     else:
-         sys.stdout = wrapped_stdout = \
-             wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
-     if sys.stderr is None:
-         wrapped_stderr = None
-     else:
-         sys.stderr = wrapped_stderr = \
-             wrap_stream(orig_stderr, convert, strip, autoreset, wrap)
-
-     global atexit_done
-     if not atexit_done:
-         atexit.register(reset_all)
-         atexit_done = True
-
-
- def deinit():
-     if orig_stdout is not None:
-         sys.stdout = orig_stdout
-     if orig_stderr is not None:
-         sys.stderr = orig_stderr
-
-
- def just_fix_windows_console():
-     global fixed_windows_console
-
-     if sys.platform != "win32":
-         return
-     if fixed_windows_console:
-         return
-     if wrapped_stdout is not None or wrapped_stderr is not None:
-         # Someone already ran init() and it did stuff, so we won't second-guess them
-         return
-
-     # On newer versions of Windows, AnsiToWin32.__init__ will implicitly enable the
-     # native ANSI support in the console as a side-effect. We only need to actually
-     # replace sys.stdout/stderr if we're in the old-style conversion mode.
-     new_stdout = AnsiToWin32(sys.stdout, convert=None, strip=None, autoreset=False)
-     if new_stdout.convert:
-         sys.stdout = new_stdout
-     new_stderr = AnsiToWin32(sys.stderr, convert=None, strip=None, autoreset=False)
-     if new_stderr.convert:
-         sys.stderr = new_stderr
-
-     fixed_windows_console = True
-
- @contextlib.contextmanager
- def colorama_text(*args, **kwargs):
-     init(*args, **kwargs)
-     try:
-         yield
-     finally:
-         deinit()
-
-
- def reinit():
-     if wrapped_stdout is not None:
-         sys.stdout = wrapped_stdout
-     if wrapped_stderr is not None:
-         sys.stderr = wrapped_stderr
-
-
- def wrap_stream(stream, convert, strip, autoreset, wrap):
-     if wrap:
-         wrapper = AnsiToWin32(stream,
-             convert=convert, strip=strip, autoreset=autoreset)
-         if wrapper.should_wrap():
-             stream = wrapper.stream
-     return stream
-
-
- # Use this for initial setup as well, to reduce code duplication
- _wipe_internal_state_for_tests()
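
init() swaps sys.stdout/sys.stderr for wrapping proxies, deinit() restores the originals, and reinit() re-installs the saved wrappers; colorama_text packages the init/deinit pair as a context manager. A sketch of the lifecycle, assuming colorama is installed:

    from colorama import Fore, colorama_text

    with colorama_text(autoreset=True):
        # init() has swapped sys.stdout for a wrapping proxy here
        print(Fore.GREEN + 'inside the context: wrapped and auto-reset')

    # deinit() has restored the original sys.stdout; reinit() would re-wrap it.
    print('outside: the original stream is back')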
env/Lib/site-packages/colorama/tests/__init__.py DELETED
@@ -1 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
env/Lib/site-packages/colorama/tests/ansi_test.py DELETED
@@ -1,76 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
- import sys
- from unittest import TestCase, main
-
- from ..ansi import Back, Fore, Style
- from ..ansitowin32 import AnsiToWin32
-
- stdout_orig = sys.stdout
- stderr_orig = sys.stderr
-
-
- class AnsiTest(TestCase):
-
-     def setUp(self):
-         # sanity check: stdout should be a file or StringIO object.
-         # It will only be AnsiToWin32 if init() has previously wrapped it
-         self.assertNotEqual(type(sys.stdout), AnsiToWin32)
-         self.assertNotEqual(type(sys.stderr), AnsiToWin32)
-
-     def tearDown(self):
-         sys.stdout = stdout_orig
-         sys.stderr = stderr_orig
-
-
-     def testForeAttributes(self):
-         self.assertEqual(Fore.BLACK, '\033[30m')
-         self.assertEqual(Fore.RED, '\033[31m')
-         self.assertEqual(Fore.GREEN, '\033[32m')
-         self.assertEqual(Fore.YELLOW, '\033[33m')
-         self.assertEqual(Fore.BLUE, '\033[34m')
-         self.assertEqual(Fore.MAGENTA, '\033[35m')
-         self.assertEqual(Fore.CYAN, '\033[36m')
-         self.assertEqual(Fore.WHITE, '\033[37m')
-         self.assertEqual(Fore.RESET, '\033[39m')
-
-         # Check the light, extended versions.
-         self.assertEqual(Fore.LIGHTBLACK_EX, '\033[90m')
-         self.assertEqual(Fore.LIGHTRED_EX, '\033[91m')
-         self.assertEqual(Fore.LIGHTGREEN_EX, '\033[92m')
-         self.assertEqual(Fore.LIGHTYELLOW_EX, '\033[93m')
-         self.assertEqual(Fore.LIGHTBLUE_EX, '\033[94m')
-         self.assertEqual(Fore.LIGHTMAGENTA_EX, '\033[95m')
-         self.assertEqual(Fore.LIGHTCYAN_EX, '\033[96m')
-         self.assertEqual(Fore.LIGHTWHITE_EX, '\033[97m')
-
-
-     def testBackAttributes(self):
-         self.assertEqual(Back.BLACK, '\033[40m')
-         self.assertEqual(Back.RED, '\033[41m')
-         self.assertEqual(Back.GREEN, '\033[42m')
-         self.assertEqual(Back.YELLOW, '\033[43m')
-         self.assertEqual(Back.BLUE, '\033[44m')
-         self.assertEqual(Back.MAGENTA, '\033[45m')
-         self.assertEqual(Back.CYAN, '\033[46m')
-         self.assertEqual(Back.WHITE, '\033[47m')
-         self.assertEqual(Back.RESET, '\033[49m')
-
-         # Check the light, extended versions.
-         self.assertEqual(Back.LIGHTBLACK_EX, '\033[100m')
-         self.assertEqual(Back.LIGHTRED_EX, '\033[101m')
-         self.assertEqual(Back.LIGHTGREEN_EX, '\033[102m')
-         self.assertEqual(Back.LIGHTYELLOW_EX, '\033[103m')
-         self.assertEqual(Back.LIGHTBLUE_EX, '\033[104m')
-         self.assertEqual(Back.LIGHTMAGENTA_EX, '\033[105m')
-         self.assertEqual(Back.LIGHTCYAN_EX, '\033[106m')
-         self.assertEqual(Back.LIGHTWHITE_EX, '\033[107m')
-
-
-     def testStyleAttributes(self):
-         self.assertEqual(Style.DIM, '\033[2m')
-         self.assertEqual(Style.NORMAL, '\033[22m')
-         self.assertEqual(Style.BRIGHT, '\033[1m')
-
-
- if __name__ == '__main__':
-     main()
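
The escape strings asserted here match the code tables in ansi.py above, and the case is runnable on its own. One way to drive just this TestCase programmatically, assuming the tests ship with the checkout as in this tree:

    import unittest

    from colorama.tests.ansi_test import AnsiTest

    # Build and run a suite for only the TestCase shown above.
    suite = unittest.defaultTestLoader.loadTestsFromTestCase(AnsiTest)
    unittest.TextTestRunner(verbosity=2).run(suite)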
env/Lib/site-packages/colorama/tests/ansitowin32_test.py DELETED
@@ -1,294 +0,0 @@
- # Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
- from io import StringIO, TextIOWrapper
- from unittest import TestCase, main
- try:
-     from contextlib import ExitStack
- except ImportError:
-     # python 2
-     from contextlib2 import ExitStack
-
- try:
-     from unittest.mock import MagicMock, Mock, patch
- except ImportError:
-     from mock import MagicMock, Mock, patch
-
- from ..ansitowin32 import AnsiToWin32, StreamWrapper
- from ..win32 import ENABLE_VIRTUAL_TERMINAL_PROCESSING
- from .utils import osname
-
-
- class StreamWrapperTest(TestCase):
-
-     def testIsAProxy(self):
-         mockStream = Mock()
-         wrapper = StreamWrapper(mockStream, None)
-         self.assertTrue( wrapper.random_attr is mockStream.random_attr )
-
-     def testDelegatesWrite(self):
-         mockStream = Mock()
-         mockConverter = Mock()
-         wrapper = StreamWrapper(mockStream, mockConverter)
-         wrapper.write('hello')
-         self.assertTrue(mockConverter.write.call_args, (('hello',), {}))
-
-     def testDelegatesContext(self):
-         mockConverter = Mock()
-         s = StringIO()
-         with StreamWrapper(s, mockConverter) as fp:
-             fp.write(u'hello')
-         self.assertTrue(s.closed)
-
-     def testProxyNoContextManager(self):
-         mockStream = MagicMock()
-         mockStream.__enter__.side_effect = AttributeError()
-         mockConverter = Mock()
-         with self.assertRaises(AttributeError) as excinfo:
-             with StreamWrapper(mockStream, mockConverter) as wrapper:
-                 wrapper.write('hello')
-
-     def test_closed_shouldnt_raise_on_closed_stream(self):
-         stream = StringIO()
-         stream.close()
-         wrapper = StreamWrapper(stream, None)
-         self.assertEqual(wrapper.closed, True)
-
-     def test_closed_shouldnt_raise_on_detached_stream(self):
-         stream = TextIOWrapper(StringIO())
-         stream.detach()
-         wrapper = StreamWrapper(stream, None)
-         self.assertEqual(wrapper.closed, True)
-
- class AnsiToWin32Test(TestCase):
-
-     def testInit(self):
-         mockStdout = Mock()
-         auto = Mock()
-         stream = AnsiToWin32(mockStdout, autoreset=auto)
-         self.assertEqual(stream.wrapped, mockStdout)
-         self.assertEqual(stream.autoreset, auto)
-
-     @patch('colorama.ansitowin32.winterm', None)
-     @patch('colorama.ansitowin32.winapi_test', lambda *_: True)
-     def testStripIsTrueOnWindows(self):
-         with osname('nt'):
-             mockStdout = Mock()
-             stream = AnsiToWin32(mockStdout)
-             self.assertTrue(stream.strip)
-
-     def testStripIsFalseOffWindows(self):
-         with osname('posix'):
-             mockStdout = Mock(closed=False)
-             stream = AnsiToWin32(mockStdout)
-             self.assertFalse(stream.strip)
-
-     def testWriteStripsAnsi(self):
-         mockStdout = Mock()
-         stream = AnsiToWin32(mockStdout)
-         stream.wrapped = Mock()
-         stream.write_and_convert = Mock()
-         stream.strip = True
-
-         stream.write('abc')
-
-         self.assertFalse(stream.wrapped.write.called)
-         self.assertEqual(stream.write_and_convert.call_args, (('abc',), {}))
-
-     def testWriteDoesNotStripAnsi(self):
-         mockStdout = Mock()
-         stream = AnsiToWin32(mockStdout)
-         stream.wrapped = Mock()
-         stream.write_and_convert = Mock()
-         stream.strip = False
-         stream.convert = False
-
-         stream.write('abc')
-
-         self.assertFalse(stream.write_and_convert.called)
-         self.assertEqual(stream.wrapped.write.call_args, (('abc',), {}))
-
-     def assert_autoresets(self, convert, autoreset=True):
-         stream = AnsiToWin32(Mock())
-         stream.convert = convert
-         stream.reset_all = Mock()
-         stream.autoreset = autoreset
-         stream.winterm = Mock()
-
-         stream.write('abc')
-
-         self.assertEqual(stream.reset_all.called, autoreset)
-
-     def testWriteAutoresets(self):
-         self.assert_autoresets(convert=True)
-         self.assert_autoresets(convert=False)
-         self.assert_autoresets(convert=True, autoreset=False)
-         self.assert_autoresets(convert=False, autoreset=False)
-
-     def testWriteAndConvertWritesPlainText(self):
-         stream = AnsiToWin32(Mock())
-         stream.write_and_convert( 'abc' )
-         self.assertEqual( stream.wrapped.write.call_args, (('abc',), {}) )
-
-     def testWriteAndConvertStripsAllValidAnsi(self):
-         stream = AnsiToWin32(Mock())
-         stream.call_win32 = Mock()
-         data = [
-             'abc\033[mdef',
-             'abc\033[0mdef',
-             'abc\033[2mdef',
-             'abc\033[02mdef',
-             'abc\033[002mdef',
-             'abc\033[40mdef',
-             'abc\033[040mdef',
-             'abc\033[0;1mdef',
-             'abc\033[40;50mdef',
-             'abc\033[50;30;40mdef',
-             'abc\033[Adef',
-             'abc\033[0Gdef',
-             'abc\033[1;20;128Hdef',
-         ]
-         for datum in data:
-             stream.wrapped.write.reset_mock()
-             stream.write_and_convert( datum )
-             self.assertEqual(
-                 [args[0] for args in stream.wrapped.write.call_args_list],
-                 [ ('abc',), ('def',) ]
-             )
-
-     def testWriteAndConvertSkipsEmptySnippets(self):
-         stream = AnsiToWin32(Mock())
-         stream.call_win32 = Mock()
-         stream.write_and_convert( '\033[40m\033[41m' )
-         self.assertFalse( stream.wrapped.write.called )
-
-     def testWriteAndConvertCallsWin32WithParamsAndCommand(self):
-         stream = AnsiToWin32(Mock())
-         stream.convert = True
-         stream.call_win32 = Mock()
-         stream.extract_params = Mock(return_value='params')
-         data = {
-             'abc\033[adef': ('a', 'params'),
-             'abc\033[;;bdef': ('b', 'params'),
-             'abc\033[0cdef': ('c', 'params'),
-             'abc\033[;;0;;Gdef': ('G', 'params'),
-             'abc\033[1;20;128Hdef': ('H', 'params'),
-         }
-         for datum, expected in data.items():
-             stream.call_win32.reset_mock()
-             stream.write_and_convert( datum )
-             self.assertEqual( stream.call_win32.call_args[0], expected )
-
-     def test_reset_all_shouldnt_raise_on_closed_orig_stdout(self):
-         stream = StringIO()
-         converter = AnsiToWin32(stream)
-         stream.close()
-
-         converter.reset_all()
-
-     def test_wrap_shouldnt_raise_on_closed_orig_stdout(self):
-         stream = StringIO()
-         stream.close()
-         with \
-                 patch("colorama.ansitowin32.os.name", "nt"), \
-                 patch("colorama.ansitowin32.winapi_test", lambda: True):
-             converter = AnsiToWin32(stream)
-         self.assertTrue(converter.strip)
-         self.assertFalse(converter.convert)
-
-     def test_wrap_shouldnt_raise_on_missing_closed_attr(self):
-         with \
-                 patch("colorama.ansitowin32.os.name", "nt"), \
-                 patch("colorama.ansitowin32.winapi_test", lambda: True):
-             converter = AnsiToWin32(object())
-         self.assertTrue(converter.strip)
-         self.assertFalse(converter.convert)
-
-     def testExtractParams(self):
-         stream = AnsiToWin32(Mock())
-         data = {
-             '': (0,),
-             ';;': (0,),
-             '2': (2,),
-             ';;002;;': (2,),
-             '0;1': (0, 1),
-             ';;003;;456;;': (3, 456),
-             '11;22;33;44;55': (11, 22, 33, 44, 55),
-         }
-         for datum, expected in data.items():
-             self.assertEqual(stream.extract_params('m', datum), expected)
-
-     def testCallWin32UsesLookup(self):
-         listener = Mock()
-         stream = AnsiToWin32(listener)
-         stream.win32_calls = {
-             1: (lambda *_, **__: listener(11),),
-             2: (lambda *_, **__: listener(22),),
-             3: (lambda *_, **__: listener(33),),
-         }
-         stream.call_win32('m', (3, 1, 99, 2))
-         self.assertEqual(
-             [a[0][0] for a in listener.call_args_list],
-             [33, 11, 22] )
-
-     def test_osc_codes(self):
-         mockStdout = Mock()
-         stream = AnsiToWin32(mockStdout, convert=True)
-         with patch('colorama.ansitowin32.winterm') as winterm:
-             data = [
-                 '\033]0\x07',                        # missing arguments
-                 '\033]0;foo\x08',                    # wrong OSC command
-                 '\033]0;colorama_test_title\x07',    # should work
-                 '\033]1;colorama_test_title\x07',    # wrong set command
-                 '\033]2;colorama_test_title\x07',    # should work
-                 '\033]' + ';' * 64 + '\x08',         # see issue #247
-             ]
-             for code in data:
-                 stream.write(code)
-         self.assertEqual(winterm.set_title.call_count, 2)
-
-     def test_native_windows_ansi(self):
-         with ExitStack() as stack:
-             def p(a, b):
-                 stack.enter_context(patch(a, b, create=True))
-             # Pretend to be on Windows
-             p("colorama.ansitowin32.os.name", "nt")
-             p("colorama.ansitowin32.winapi_test", lambda: True)
-             p("colorama.win32.winapi_test", lambda: True)
-             p("colorama.winterm.win32.windll", "non-None")
-             p("colorama.winterm.get_osfhandle", lambda _: 1234)
-
-             # Pretend that our mock stream has native ANSI support
-             p(
-                 "colorama.winterm.win32.GetConsoleMode",
-                 lambda _: ENABLE_VIRTUAL_TERMINAL_PROCESSING,
-             )
-             SetConsoleMode = Mock()
-             p("colorama.winterm.win32.SetConsoleMode", SetConsoleMode)
-
-             stdout = Mock()
-             stdout.closed = False
-             stdout.isatty.return_value = True
-             stdout.fileno.return_value = 1
-
-             # Our fake console says it has native vt support, so AnsiToWin32 should
-             # enable that support and do nothing else.
-             stream = AnsiToWin32(stdout)
-             SetConsoleMode.assert_called_with(1234, ENABLE_VIRTUAL_TERMINAL_PROCESSING)
-             self.assertFalse(stream.strip)
-             self.assertFalse(stream.convert)
-             self.assertFalse(stream.should_wrap())
-
-             # Now let's pretend we're on an old Windows console, that doesn't have
-             # native ANSI support.
-             p("colorama.winterm.win32.GetConsoleMode", lambda _: 0)
-             SetConsoleMode = Mock()
-             p("colorama.winterm.win32.SetConsoleMode", SetConsoleMode)
-
-             stream = AnsiToWin32(stdout)
-             SetConsoleMode.assert_called_with(1234, ENABLE_VIRTUAL_TERMINAL_PROCESSING)
-             self.assertTrue(stream.strip)
-             self.assertTrue(stream.convert)
-             self.assertTrue(stream.should_wrap())
-
-
- if __name__ == '__main__':
-     main()
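
Several tests above lean on osname from tests/utils.py, which this diff does not show. It is presumably a small context manager along these lines (a sketch of the likely helper, not the actual file):

    import os
    from contextlib import contextmanager

    @contextmanager
    def osname(name):
        # Temporarily override os.name so AnsiToWin32 takes the Windows or
        # posix code path under test, then restore the real value.
        orig = os.name
        os.name = name
        try:
            yield
        finally:
            os.name = orig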