Spaces:
Running
Running
Delete env
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +0 -1
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE +0 -20
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA +0 -46
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD +0 -43
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL +0 -5
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +0 -2
- env/Lib/site-packages/_yaml/__init__.py +0 -33
- env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER +0 -1
- env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE +0 -20
- env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA +0 -77
- env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD +0 -14
- env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL +0 -5
- env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt +0 -1
- env/Lib/site-packages/certifi/__init__.py +0 -4
- env/Lib/site-packages/certifi/__main__.py +0 -12
- env/Lib/site-packages/certifi/cacert.pem +0 -0
- env/Lib/site-packages/certifi/core.py +0 -114
- env/Lib/site-packages/certifi/py.typed +0 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +0 -1
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +0 -21
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +0 -721
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +0 -35
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +0 -5
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +0 -2
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +0 -1
- env/Lib/site-packages/charset_normalizer/__init__.py +0 -48
- env/Lib/site-packages/charset_normalizer/__main__.py +0 -6
- env/Lib/site-packages/charset_normalizer/api.py +0 -668
- env/Lib/site-packages/charset_normalizer/cd.py +0 -395
- env/Lib/site-packages/charset_normalizer/cli/__init__.py +0 -8
- env/Lib/site-packages/charset_normalizer/cli/__main__.py +0 -321
- env/Lib/site-packages/charset_normalizer/constant.py +0 -1998
- env/Lib/site-packages/charset_normalizer/legacy.py +0 -66
- env/Lib/site-packages/charset_normalizer/md.py +0 -630
- env/Lib/site-packages/charset_normalizer/models.py +0 -360
- env/Lib/site-packages/charset_normalizer/py.typed +0 -0
- env/Lib/site-packages/charset_normalizer/utils.py +0 -408
- env/Lib/site-packages/charset_normalizer/version.py +0 -8
- env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER +0 -1
- env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA +0 -441
- env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD +0 -31
- env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL +0 -5
- env/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt +0 -27
- env/Lib/site-packages/colorama/__init__.py +0 -7
- env/Lib/site-packages/colorama/ansi.py +0 -102
- env/Lib/site-packages/colorama/ansitowin32.py +0 -277
- env/Lib/site-packages/colorama/initialise.py +0 -121
- env/Lib/site-packages/colorama/tests/__init__.py +0 -1
- env/Lib/site-packages/colorama/tests/ansi_test.py +0 -76
- env/Lib/site-packages/colorama/tests/ansitowin32_test.py +0 -294
env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
pip
|
|
|
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
Copyright (c) 2017-2021 Ingy döt Net
|
2 |
-
Copyright (c) 2006-2016 Kirill Simonov
|
3 |
-
|
4 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
5 |
-
this software and associated documentation files (the "Software"), to deal in
|
6 |
-
the Software without restriction, including without limitation the rights to
|
7 |
-
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
8 |
-
of the Software, and to permit persons to whom the Software is furnished to do
|
9 |
-
so, subject to the following conditions:
|
10 |
-
|
11 |
-
The above copyright notice and this permission notice shall be included in all
|
12 |
-
copies or substantial portions of the Software.
|
13 |
-
|
14 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
Metadata-Version: 2.1
|
2 |
-
Name: PyYAML
|
3 |
-
Version: 6.0.2
|
4 |
-
Summary: YAML parser and emitter for Python
|
5 |
-
Home-page: https://pyyaml.org/
|
6 |
-
Download-URL: https://pypi.org/project/PyYAML/
|
7 |
-
Author: Kirill Simonov
|
8 |
-
Author-email: [email protected]
|
9 |
-
License: MIT
|
10 |
-
Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
|
11 |
-
Project-URL: CI, https://github.com/yaml/pyyaml/actions
|
12 |
-
Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
|
13 |
-
Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
|
14 |
-
Project-URL: Source Code, https://github.com/yaml/pyyaml
|
15 |
-
Platform: Any
|
16 |
-
Classifier: Development Status :: 5 - Production/Stable
|
17 |
-
Classifier: Intended Audience :: Developers
|
18 |
-
Classifier: License :: OSI Approved :: MIT License
|
19 |
-
Classifier: Operating System :: OS Independent
|
20 |
-
Classifier: Programming Language :: Cython
|
21 |
-
Classifier: Programming Language :: Python
|
22 |
-
Classifier: Programming Language :: Python :: 3
|
23 |
-
Classifier: Programming Language :: Python :: 3.8
|
24 |
-
Classifier: Programming Language :: Python :: 3.9
|
25 |
-
Classifier: Programming Language :: Python :: 3.10
|
26 |
-
Classifier: Programming Language :: Python :: 3.11
|
27 |
-
Classifier: Programming Language :: Python :: 3.12
|
28 |
-
Classifier: Programming Language :: Python :: 3.13
|
29 |
-
Classifier: Programming Language :: Python :: Implementation :: CPython
|
30 |
-
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
31 |
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
32 |
-
Classifier: Topic :: Text Processing :: Markup
|
33 |
-
Requires-Python: >=3.8
|
34 |
-
License-File: LICENSE
|
35 |
-
|
36 |
-
YAML is a data serialization format designed for human readability
|
37 |
-
and interaction with scripting languages. PyYAML is a YAML parser
|
38 |
-
and emitter for Python.
|
39 |
-
|
40 |
-
PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
|
41 |
-
support, capable extension API, and sensible error messages. PyYAML
|
42 |
-
supports standard YAML tags and provides Python-specific tags that
|
43 |
-
allow to represent an arbitrary Python object.
|
44 |
-
|
45 |
-
PyYAML is applicable for a broad range of tasks from complex
|
46 |
-
configuration files to object serialization and persistence.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD
DELETED
@@ -1,43 +0,0 @@
|
|
1 |
-
PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
2 |
-
PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
|
3 |
-
PyYAML-6.0.2.dist-info/METADATA,sha256=9lwXqTOrXPts-jI2Lo5UwuaAYo0hiRA0BZqjch0WjAk,2106
|
4 |
-
PyYAML-6.0.2.dist-info/RECORD,,
|
5 |
-
PyYAML-6.0.2.dist-info/WHEEL,sha256=c7SWG1_hRvc9HXHEkmWlTu1Jr4WpzRucfzqTP-_8q0s,102
|
6 |
-
PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
|
7 |
-
_yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
|
8 |
-
_yaml/__pycache__/__init__.cpython-312.pyc,,
|
9 |
-
yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
|
10 |
-
yaml/__pycache__/__init__.cpython-312.pyc,,
|
11 |
-
yaml/__pycache__/composer.cpython-312.pyc,,
|
12 |
-
yaml/__pycache__/constructor.cpython-312.pyc,,
|
13 |
-
yaml/__pycache__/cyaml.cpython-312.pyc,,
|
14 |
-
yaml/__pycache__/dumper.cpython-312.pyc,,
|
15 |
-
yaml/__pycache__/emitter.cpython-312.pyc,,
|
16 |
-
yaml/__pycache__/error.cpython-312.pyc,,
|
17 |
-
yaml/__pycache__/events.cpython-312.pyc,,
|
18 |
-
yaml/__pycache__/loader.cpython-312.pyc,,
|
19 |
-
yaml/__pycache__/nodes.cpython-312.pyc,,
|
20 |
-
yaml/__pycache__/parser.cpython-312.pyc,,
|
21 |
-
yaml/__pycache__/reader.cpython-312.pyc,,
|
22 |
-
yaml/__pycache__/representer.cpython-312.pyc,,
|
23 |
-
yaml/__pycache__/resolver.cpython-312.pyc,,
|
24 |
-
yaml/__pycache__/scanner.cpython-312.pyc,,
|
25 |
-
yaml/__pycache__/serializer.cpython-312.pyc,,
|
26 |
-
yaml/__pycache__/tokens.cpython-312.pyc,,
|
27 |
-
yaml/_yaml.cp312-win_amd64.pyd,sha256=Bx7e_LEQx7cnd1_A9_nClp3X77g-_Lw1aoAAtYZbwWk,263680
|
28 |
-
yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
|
29 |
-
yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
|
30 |
-
yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
|
31 |
-
yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
|
32 |
-
yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
|
33 |
-
yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
|
34 |
-
yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
|
35 |
-
yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
|
36 |
-
yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
|
37 |
-
yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
|
38 |
-
yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
|
39 |
-
yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
|
40 |
-
yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
|
41 |
-
yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
|
42 |
-
yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
|
43 |
-
yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
Wheel-Version: 1.0
|
2 |
-
Generator: bdist_wheel (0.44.0)
|
3 |
-
Root-Is-Purelib: false
|
4 |
-
Tag: cp312-cp312-win_amd64
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
_yaml
|
2 |
-
yaml
|
|
|
|
|
|
env/Lib/site-packages/_yaml/__init__.py
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
# This is a stub package designed to roughly emulate the _yaml
|
2 |
-
# extension module, which previously existed as a standalone module
|
3 |
-
# and has been moved into the `yaml` package namespace.
|
4 |
-
# It does not perfectly mimic its old counterpart, but should get
|
5 |
-
# close enough for anyone who's relying on it even when they shouldn't.
|
6 |
-
import yaml
|
7 |
-
|
8 |
-
# in some circumstances, the yaml module we imoprted may be from a different version, so we need
|
9 |
-
# to tread carefully when poking at it here (it may not have the attributes we expect)
|
10 |
-
if not getattr(yaml, '__with_libyaml__', False):
|
11 |
-
from sys import version_info
|
12 |
-
|
13 |
-
exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
|
14 |
-
raise exc("No module named '_yaml'")
|
15 |
-
else:
|
16 |
-
from yaml._yaml import *
|
17 |
-
import warnings
|
18 |
-
warnings.warn(
|
19 |
-
'The _yaml extension module is now located at yaml._yaml'
|
20 |
-
' and its location is subject to change. To use the'
|
21 |
-
' LibYAML-based parser and emitter, import from `yaml`:'
|
22 |
-
' `from yaml import CLoader as Loader, CDumper as Dumper`.',
|
23 |
-
DeprecationWarning
|
24 |
-
)
|
25 |
-
del warnings
|
26 |
-
# Don't `del yaml` here because yaml is actually an existing
|
27 |
-
# namespace member of _yaml.
|
28 |
-
|
29 |
-
__name__ = '_yaml'
|
30 |
-
# If the module is top-level (i.e. not a part of any specific package)
|
31 |
-
# then the attribute should be set to ''.
|
32 |
-
# https://docs.python.org/3.8/library/types.html
|
33 |
-
__package__ = ''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
pip
|
|
|
|
env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
This package contains a modified version of ca-bundle.crt:
|
2 |
-
|
3 |
-
ca-bundle.crt -- Bundle of CA Root Certificates
|
4 |
-
|
5 |
-
This is a bundle of X.509 certificates of public Certificate Authorities
|
6 |
-
(CA). These were automatically extracted from Mozilla's root certificates
|
7 |
-
file (certdata.txt). This file can be found in the mozilla source tree:
|
8 |
-
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
9 |
-
It contains the certificates in PEM format and therefore
|
10 |
-
can be directly used with curl / libcurl / php_curl, or with
|
11 |
-
an Apache+mod_ssl webserver for SSL client authentication.
|
12 |
-
Just configure this file as the SSLCACertificateFile.#
|
13 |
-
|
14 |
-
***** BEGIN LICENSE BLOCK *****
|
15 |
-
This Source Code Form is subject to the terms of the Mozilla Public License,
|
16 |
-
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
17 |
-
one at http://mozilla.org/MPL/2.0/.
|
18 |
-
|
19 |
-
***** END LICENSE BLOCK *****
|
20 |
-
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
Metadata-Version: 2.2
|
2 |
-
Name: certifi
|
3 |
-
Version: 2025.1.31
|
4 |
-
Summary: Python package for providing Mozilla's CA Bundle.
|
5 |
-
Home-page: https://github.com/certifi/python-certifi
|
6 |
-
Author: Kenneth Reitz
|
7 |
-
Author-email: [email protected]
|
8 |
-
License: MPL-2.0
|
9 |
-
Project-URL: Source, https://github.com/certifi/python-certifi
|
10 |
-
Classifier: Development Status :: 5 - Production/Stable
|
11 |
-
Classifier: Intended Audience :: Developers
|
12 |
-
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
13 |
-
Classifier: Natural Language :: English
|
14 |
-
Classifier: Programming Language :: Python
|
15 |
-
Classifier: Programming Language :: Python :: 3
|
16 |
-
Classifier: Programming Language :: Python :: 3 :: Only
|
17 |
-
Classifier: Programming Language :: Python :: 3.6
|
18 |
-
Classifier: Programming Language :: Python :: 3.7
|
19 |
-
Classifier: Programming Language :: Python :: 3.8
|
20 |
-
Classifier: Programming Language :: Python :: 3.9
|
21 |
-
Classifier: Programming Language :: Python :: 3.10
|
22 |
-
Classifier: Programming Language :: Python :: 3.11
|
23 |
-
Classifier: Programming Language :: Python :: 3.12
|
24 |
-
Classifier: Programming Language :: Python :: 3.13
|
25 |
-
Requires-Python: >=3.6
|
26 |
-
License-File: LICENSE
|
27 |
-
Dynamic: author
|
28 |
-
Dynamic: author-email
|
29 |
-
Dynamic: classifier
|
30 |
-
Dynamic: description
|
31 |
-
Dynamic: home-page
|
32 |
-
Dynamic: license
|
33 |
-
Dynamic: project-url
|
34 |
-
Dynamic: requires-python
|
35 |
-
Dynamic: summary
|
36 |
-
|
37 |
-
Certifi: Python SSL Certificates
|
38 |
-
================================
|
39 |
-
|
40 |
-
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
41 |
-
validating the trustworthiness of SSL certificates while verifying the identity
|
42 |
-
of TLS hosts. It has been extracted from the `Requests`_ project.
|
43 |
-
|
44 |
-
Installation
|
45 |
-
------------
|
46 |
-
|
47 |
-
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
48 |
-
|
49 |
-
$ pip install certifi
|
50 |
-
|
51 |
-
Usage
|
52 |
-
-----
|
53 |
-
|
54 |
-
To reference the installed certificate authority (CA) bundle, you can use the
|
55 |
-
built-in function::
|
56 |
-
|
57 |
-
>>> import certifi
|
58 |
-
|
59 |
-
>>> certifi.where()
|
60 |
-
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
61 |
-
|
62 |
-
Or from the command line::
|
63 |
-
|
64 |
-
$ python -m certifi
|
65 |
-
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
66 |
-
|
67 |
-
Enjoy!
|
68 |
-
|
69 |
-
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
70 |
-
|
71 |
-
Addition/Removal of Certificates
|
72 |
-
--------------------------------
|
73 |
-
|
74 |
-
Certifi does not support any addition/removal or other modification of the
|
75 |
-
CA trust store content. This project is intended to provide a reliable and
|
76 |
-
highly portable root of trust to python deployments. Look to upstream projects
|
77 |
-
for methods to use alternate trust.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
2 |
-
certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
3 |
-
certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
|
4 |
-
certifi-2025.1.31.dist-info/RECORD,,
|
5 |
-
certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
6 |
-
certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
7 |
-
certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
|
8 |
-
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
9 |
-
certifi/__pycache__/__init__.cpython-312.pyc,,
|
10 |
-
certifi/__pycache__/__main__.cpython-312.pyc,,
|
11 |
-
certifi/__pycache__/core.cpython-312.pyc,,
|
12 |
-
certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
|
13 |
-
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
|
14 |
-
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
Wheel-Version: 1.0
|
2 |
-
Generator: setuptools (75.8.0)
|
3 |
-
Root-Is-Purelib: true
|
4 |
-
Tag: py3-none-any
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
certifi
|
|
|
|
env/Lib/site-packages/certifi/__init__.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from .core import contents, where
|
2 |
-
|
3 |
-
__all__ = ["contents", "where"]
|
4 |
-
__version__ = "2025.01.31"
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi/__main__.py
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
|
3 |
-
from certifi import contents, where
|
4 |
-
|
5 |
-
parser = argparse.ArgumentParser()
|
6 |
-
parser.add_argument("-c", "--contents", action="store_true")
|
7 |
-
args = parser.parse_args()
|
8 |
-
|
9 |
-
if args.contents:
|
10 |
-
print(contents())
|
11 |
-
else:
|
12 |
-
print(where())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi/cacert.pem
DELETED
The diff for this file is too large to render.
See raw diff
|
|
env/Lib/site-packages/certifi/core.py
DELETED
@@ -1,114 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
certifi.py
|
3 |
-
~~~~~~~~~~
|
4 |
-
|
5 |
-
This module returns the installation location of cacert.pem or its contents.
|
6 |
-
"""
|
7 |
-
import sys
|
8 |
-
import atexit
|
9 |
-
|
10 |
-
def exit_cacert_ctx() -> None:
|
11 |
-
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
|
12 |
-
|
13 |
-
|
14 |
-
if sys.version_info >= (3, 11):
|
15 |
-
|
16 |
-
from importlib.resources import as_file, files
|
17 |
-
|
18 |
-
_CACERT_CTX = None
|
19 |
-
_CACERT_PATH = None
|
20 |
-
|
21 |
-
def where() -> str:
|
22 |
-
# This is slightly terrible, but we want to delay extracting the file
|
23 |
-
# in cases where we're inside of a zipimport situation until someone
|
24 |
-
# actually calls where(), but we don't want to re-extract the file
|
25 |
-
# on every call of where(), so we'll do it once then store it in a
|
26 |
-
# global variable.
|
27 |
-
global _CACERT_CTX
|
28 |
-
global _CACERT_PATH
|
29 |
-
if _CACERT_PATH is None:
|
30 |
-
# This is slightly janky, the importlib.resources API wants you to
|
31 |
-
# manage the cleanup of this file, so it doesn't actually return a
|
32 |
-
# path, it returns a context manager that will give you the path
|
33 |
-
# when you enter it and will do any cleanup when you leave it. In
|
34 |
-
# the common case of not needing a temporary file, it will just
|
35 |
-
# return the file system location and the __exit__() is a no-op.
|
36 |
-
#
|
37 |
-
# We also have to hold onto the actual context manager, because
|
38 |
-
# it will do the cleanup whenever it gets garbage collected, so
|
39 |
-
# we will also store that at the global level as well.
|
40 |
-
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
41 |
-
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
42 |
-
atexit.register(exit_cacert_ctx)
|
43 |
-
|
44 |
-
return _CACERT_PATH
|
45 |
-
|
46 |
-
def contents() -> str:
|
47 |
-
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
48 |
-
|
49 |
-
elif sys.version_info >= (3, 7):
|
50 |
-
|
51 |
-
from importlib.resources import path as get_path, read_text
|
52 |
-
|
53 |
-
_CACERT_CTX = None
|
54 |
-
_CACERT_PATH = None
|
55 |
-
|
56 |
-
def where() -> str:
|
57 |
-
# This is slightly terrible, but we want to delay extracting the
|
58 |
-
# file in cases where we're inside of a zipimport situation until
|
59 |
-
# someone actually calls where(), but we don't want to re-extract
|
60 |
-
# the file on every call of where(), so we'll do it once then store
|
61 |
-
# it in a global variable.
|
62 |
-
global _CACERT_CTX
|
63 |
-
global _CACERT_PATH
|
64 |
-
if _CACERT_PATH is None:
|
65 |
-
# This is slightly janky, the importlib.resources API wants you
|
66 |
-
# to manage the cleanup of this file, so it doesn't actually
|
67 |
-
# return a path, it returns a context manager that will give
|
68 |
-
# you the path when you enter it and will do any cleanup when
|
69 |
-
# you leave it. In the common case of not needing a temporary
|
70 |
-
# file, it will just return the file system location and the
|
71 |
-
# __exit__() is a no-op.
|
72 |
-
#
|
73 |
-
# We also have to hold onto the actual context manager, because
|
74 |
-
# it will do the cleanup whenever it gets garbage collected, so
|
75 |
-
# we will also store that at the global level as well.
|
76 |
-
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
77 |
-
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
78 |
-
atexit.register(exit_cacert_ctx)
|
79 |
-
|
80 |
-
return _CACERT_PATH
|
81 |
-
|
82 |
-
def contents() -> str:
|
83 |
-
return read_text("certifi", "cacert.pem", encoding="ascii")
|
84 |
-
|
85 |
-
else:
|
86 |
-
import os
|
87 |
-
import types
|
88 |
-
from typing import Union
|
89 |
-
|
90 |
-
Package = Union[types.ModuleType, str]
|
91 |
-
Resource = Union[str, "os.PathLike"]
|
92 |
-
|
93 |
-
# This fallback will work for Python versions prior to 3.7 that lack the
|
94 |
-
# importlib.resources module but relies on the existing `where` function
|
95 |
-
# so won't address issues with environments like PyOxidizer that don't set
|
96 |
-
# __file__ on modules.
|
97 |
-
def read_text(
|
98 |
-
package: Package,
|
99 |
-
resource: Resource,
|
100 |
-
encoding: str = 'utf-8',
|
101 |
-
errors: str = 'strict'
|
102 |
-
) -> str:
|
103 |
-
with open(where(), encoding=encoding) as data:
|
104 |
-
return data.read()
|
105 |
-
|
106 |
-
# If we don't have importlib.resources, then we will just do the old logic
|
107 |
-
# of assuming we're on the filesystem and munge the path directly.
|
108 |
-
def where() -> str:
|
109 |
-
f = os.path.dirname(__file__)
|
110 |
-
|
111 |
-
return os.path.join(f, "cacert.pem")
|
112 |
-
|
113 |
-
def contents() -> str:
|
114 |
-
return read_text("certifi", "cacert.pem", encoding="ascii")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/certifi/py.typed
DELETED
File without changes
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
pip
|
|
|
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
MIT License
|
2 |
-
|
3 |
-
Copyright (c) 2025 TAHRI Ahmed R.
|
4 |
-
|
5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
of this software and associated documentation files (the "Software"), to deal
|
7 |
-
in the Software without restriction, including without limitation the rights
|
8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
copies of the Software, and to permit persons to whom the Software is
|
10 |
-
furnished to do so, subject to the following conditions:
|
11 |
-
|
12 |
-
The above copyright notice and this permission notice shall be included in all
|
13 |
-
copies or substantial portions of the Software.
|
14 |
-
|
15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA
DELETED
@@ -1,721 +0,0 @@
|
|
1 |
-
Metadata-Version: 2.1
|
2 |
-
Name: charset-normalizer
|
3 |
-
Version: 3.4.1
|
4 |
-
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
5 |
-
Author-email: "Ahmed R. TAHRI" <[email protected]>
|
6 |
-
Maintainer-email: "Ahmed R. TAHRI" <[email protected]>
|
7 |
-
License: MIT
|
8 |
-
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
9 |
-
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
10 |
-
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
11 |
-
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
12 |
-
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
13 |
-
Classifier: Development Status :: 5 - Production/Stable
|
14 |
-
Classifier: Intended Audience :: Developers
|
15 |
-
Classifier: License :: OSI Approved :: MIT License
|
16 |
-
Classifier: Operating System :: OS Independent
|
17 |
-
Classifier: Programming Language :: Python
|
18 |
-
Classifier: Programming Language :: Python :: 3
|
19 |
-
Classifier: Programming Language :: Python :: 3.7
|
20 |
-
Classifier: Programming Language :: Python :: 3.8
|
21 |
-
Classifier: Programming Language :: Python :: 3.9
|
22 |
-
Classifier: Programming Language :: Python :: 3.10
|
23 |
-
Classifier: Programming Language :: Python :: 3.11
|
24 |
-
Classifier: Programming Language :: Python :: 3.12
|
25 |
-
Classifier: Programming Language :: Python :: 3.13
|
26 |
-
Classifier: Programming Language :: Python :: 3 :: Only
|
27 |
-
Classifier: Programming Language :: Python :: Implementation :: CPython
|
28 |
-
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
29 |
-
Classifier: Topic :: Text Processing :: Linguistic
|
30 |
-
Classifier: Topic :: Utilities
|
31 |
-
Classifier: Typing :: Typed
|
32 |
-
Requires-Python: >=3.7
|
33 |
-
Description-Content-Type: text/markdown
|
34 |
-
License-File: LICENSE
|
35 |
-
Provides-Extra: unicode-backport
|
36 |
-
|
37 |
-
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
38 |
-
|
39 |
-
<p align="center">
|
40 |
-
<sup>The Real First Universal Charset Detector</sup><br>
|
41 |
-
<a href="https://pypi.org/project/charset-normalizer">
|
42 |
-
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
43 |
-
</a>
|
44 |
-
<a href="https://pepy.tech/project/charset-normalizer/">
|
45 |
-
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
46 |
-
</a>
|
47 |
-
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
48 |
-
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
49 |
-
</a>
|
50 |
-
</p>
|
51 |
-
<p align="center">
|
52 |
-
<sup><i>Featured Packages</i></sup><br>
|
53 |
-
<a href="https://github.com/jawah/niquests">
|
54 |
-
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
|
55 |
-
</a>
|
56 |
-
<a href="https://github.com/jawah/wassima">
|
57 |
-
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
58 |
-
</a>
|
59 |
-
</p>
|
60 |
-
<p align="center">
|
61 |
-
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
62 |
-
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
63 |
-
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
64 |
-
</a>
|
65 |
-
</p>
|
66 |
-
|
67 |
-
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
68 |
-
> I'm trying to resolve the issue by taking a new approach.
|
69 |
-
> All IANA character set names for which the Python core library provides codecs are supported.
|
70 |
-
|
71 |
-
<p align="center">
|
72 |
-
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
73 |
-
</p>
|
74 |
-
|
75 |
-
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
76 |
-
|
77 |
-
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
78 |
-
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
79 |
-
| `Fast` | ❌ | ✅ | ✅ |
|
80 |
-
| `Universal**` | ❌ | ✅ | ❌ |
|
81 |
-
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
82 |
-
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
83 |
-
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
84 |
-
| `Native Python` | ✅ | ✅ | ❌ |
|
85 |
-
| `Detect spoken language` | ❌ | ✅ | N/A |
|
86 |
-
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
87 |
-
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
88 |
-
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
89 |
-
|
90 |
-
<p align="center">
|
91 |
-
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
92 |
-
</p>
|
93 |
-
|
94 |
-
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
95 |
-
|
96 |
-
## ⚡ Performance
|
97 |
-
|
98 |
-
This package offer better performance than its counterpart Chardet. Here are some numbers.
|
99 |
-
|
100 |
-
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
101 |
-
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
102 |
-
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
103 |
-
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
104 |
-
|
105 |
-
| Package | 99th percentile | 95th percentile | 50th percentile |
|
106 |
-
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
107 |
-
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
108 |
-
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
109 |
-
|
110 |
-
_updated as of december 2024 using CPython 3.12_
|
111 |
-
|
112 |
-
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
|
113 |
-
|
114 |
-
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
115 |
-
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
116 |
-
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
117 |
-
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
118 |
-
> (e.g. Supported Encoding) Challenge-them if you want.
|
119 |
-
|
120 |
-
## ✨ Installation
|
121 |
-
|
122 |
-
Using pip:
|
123 |
-
|
124 |
-
```sh
|
125 |
-
pip install charset-normalizer -U
|
126 |
-
```
|
127 |
-
|
128 |
-
## 🚀 Basic Usage
|
129 |
-
|
130 |
-
### CLI
|
131 |
-
This package comes with a CLI.
|
132 |
-
|
133 |
-
```
|
134 |
-
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
135 |
-
file [file ...]
|
136 |
-
|
137 |
-
The Real First Universal Charset Detector. Discover originating encoding used
|
138 |
-
on text file. Normalize text to unicode.
|
139 |
-
|
140 |
-
positional arguments:
|
141 |
-
files File(s) to be analysed
|
142 |
-
|
143 |
-
optional arguments:
|
144 |
-
-h, --help show this help message and exit
|
145 |
-
-v, --verbose Display complementary information about file if any.
|
146 |
-
Stdout will contain logs about the detection process.
|
147 |
-
-a, --with-alternative
|
148 |
-
Output complementary possibilities if any. Top-level
|
149 |
-
JSON WILL be a list.
|
150 |
-
-n, --normalize Permit to normalize input file. If not set, program
|
151 |
-
does not write anything.
|
152 |
-
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
153 |
-
JSON output.
|
154 |
-
-r, --replace Replace file when trying to normalize it instead of
|
155 |
-
creating a new one.
|
156 |
-
-f, --force Replace file without asking if you are sure, use this
|
157 |
-
flag with caution.
|
158 |
-
-t THRESHOLD, --threshold THRESHOLD
|
159 |
-
Define a custom maximum amount of chaos allowed in
|
160 |
-
decoded content. 0. <= chaos <= 1.
|
161 |
-
--version Show version information and exit.
|
162 |
-
```
|
163 |
-
|
164 |
-
```bash
|
165 |
-
normalizer ./data/sample.1.fr.srt
|
166 |
-
```
|
167 |
-
|
168 |
-
or
|
169 |
-
|
170 |
-
```bash
|
171 |
-
python -m charset_normalizer ./data/sample.1.fr.srt
|
172 |
-
```
|
173 |
-
|
174 |
-
🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
|
175 |
-
|
176 |
-
```json
|
177 |
-
{
|
178 |
-
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
179 |
-
"encoding": "cp1252",
|
180 |
-
"encoding_aliases": [
|
181 |
-
"1252",
|
182 |
-
"windows_1252"
|
183 |
-
],
|
184 |
-
"alternative_encodings": [
|
185 |
-
"cp1254",
|
186 |
-
"cp1256",
|
187 |
-
"cp1258",
|
188 |
-
"iso8859_14",
|
189 |
-
"iso8859_15",
|
190 |
-
"iso8859_16",
|
191 |
-
"iso8859_3",
|
192 |
-
"iso8859_9",
|
193 |
-
"latin_1",
|
194 |
-
"mbcs"
|
195 |
-
],
|
196 |
-
"language": "French",
|
197 |
-
"alphabets": [
|
198 |
-
"Basic Latin",
|
199 |
-
"Latin-1 Supplement"
|
200 |
-
],
|
201 |
-
"has_sig_or_bom": false,
|
202 |
-
"chaos": 0.149,
|
203 |
-
"coherence": 97.152,
|
204 |
-
"unicode_path": null,
|
205 |
-
"is_preferred": true
|
206 |
-
}
|
207 |
-
```
|
208 |
-
|
209 |
-
### Python
|
210 |
-
*Just print out normalized text*
|
211 |
-
```python
|
212 |
-
from charset_normalizer import from_path
|
213 |
-
|
214 |
-
results = from_path('./my_subtitle.srt')
|
215 |
-
|
216 |
-
print(str(results.best()))
|
217 |
-
```
|
218 |
-
|
219 |
-
*Upgrade your code without effort*
|
220 |
-
```python
|
221 |
-
from charset_normalizer import detect
|
222 |
-
```
|
223 |
-
|
224 |
-
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
225 |
-
|
226 |
-
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
227 |
-
|
228 |
-
## 😇 Why
|
229 |
-
|
230 |
-
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
231 |
-
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
232 |
-
|
233 |
-
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
234 |
-
produce **two identical rendered string.**
|
235 |
-
What I want is to get readable text, the best I can.
|
236 |
-
|
237 |
-
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
238 |
-
|
239 |
-
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
|
240 |
-
|
241 |
-
## 🍰 How
|
242 |
-
|
243 |
-
- Discard all charset encoding table that could not fit the binary content.
|
244 |
-
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
245 |
-
- Extract matches with the lowest mess detected.
|
246 |
-
- Additionally, we measure coherence / probe for a language.
|
247 |
-
|
248 |
-
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
249 |
-
|
250 |
-
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
251 |
-
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
252 |
-
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
253 |
-
improve or rewrite it.
|
254 |
-
|
255 |
-
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
256 |
-
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
257 |
-
|
258 |
-
## ⚡ Known limitations
|
259 |
-
|
260 |
-
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
261 |
-
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
|
262 |
-
|
263 |
-
## ⚠️ About Python EOLs
|
264 |
-
|
265 |
-
**If you are running:**
|
266 |
-
|
267 |
-
- Python >=2.7,<3.5: Unsupported
|
268 |
-
- Python 3.5: charset-normalizer < 2.1
|
269 |
-
- Python 3.6: charset-normalizer < 3.1
|
270 |
-
- Python 3.7: charset-normalizer < 4.0
|
271 |
-
|
272 |
-
Upgrade your Python interpreter as soon as possible.
|
273 |
-
|
274 |
-
## 👤 Contributing
|
275 |
-
|
276 |
-
Contributions, issues and feature requests are very much welcome.<br />
|
277 |
-
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
278 |
-
|
279 |
-
## 📝 License
|
280 |
-
|
281 |
-
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
282 |
-
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
283 |
-
|
284 |
-
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
285 |
-
|
286 |
-
## 💼 For Enterprise
|
287 |
-
|
288 |
-
Professional support for charset-normalizer is available as part of the [Tidelift
|
289 |
-
Subscription][1]. Tidelift gives software development teams a single source for
|
290 |
-
purchasing and maintaining their software, with professional grade assurances
|
291 |
-
from the experts who know it best, while seamlessly integrating with existing
|
292 |
-
tools.
|
293 |
-
|
294 |
-
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
295 |
-
|
296 |
-
[](https://www.bestpractices.dev/projects/7297)
|
297 |
-
|
298 |
-
# Changelog
|
299 |
-
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
300 |
-
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
301 |
-
|
302 |
-
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
303 |
-
|
304 |
-
### Changed
|
305 |
-
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
306 |
-
- Enforce annotation delayed loading for a simpler and consistent types in the project.
|
307 |
-
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
308 |
-
|
309 |
-
### Added
|
310 |
-
- pre-commit configuration.
|
311 |
-
- noxfile.
|
312 |
-
|
313 |
-
### Removed
|
314 |
-
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
315 |
-
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
316 |
-
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
317 |
-
- Unused `utils.range_scan` function.
|
318 |
-
|
319 |
-
### Fixed
|
320 |
-
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
321 |
-
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
322 |
-
|
323 |
-
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
324 |
-
|
325 |
-
### Added
|
326 |
-
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
|
327 |
-
- Support for Python 3.13 (#512)
|
328 |
-
|
329 |
-
### Fixed
|
330 |
-
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
331 |
-
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
|
332 |
-
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
333 |
-
|
334 |
-
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
335 |
-
|
336 |
-
### Fixed
|
337 |
-
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
338 |
-
- Regression on some detection case showcased in the documentation (#371)
|
339 |
-
|
340 |
-
### Added
|
341 |
-
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
342 |
-
|
343 |
-
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
344 |
-
|
345 |
-
### Changed
|
346 |
-
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
347 |
-
- Improved the general detection reliability based on reports from the community
|
348 |
-
|
349 |
-
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
350 |
-
|
351 |
-
### Added
|
352 |
-
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
353 |
-
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
354 |
-
|
355 |
-
### Removed
|
356 |
-
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
357 |
-
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
358 |
-
|
359 |
-
### Changed
|
360 |
-
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
361 |
-
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
362 |
-
|
363 |
-
### Fixed
|
364 |
-
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
365 |
-
|
366 |
-
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
367 |
-
|
368 |
-
### Changed
|
369 |
-
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
370 |
-
- Minor improvement over the global detection reliability
|
371 |
-
|
372 |
-
### Added
|
373 |
-
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
374 |
-
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
375 |
-
- Explicit support for Python 3.12
|
376 |
-
|
377 |
-
### Fixed
|
378 |
-
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
379 |
-
|
380 |
-
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
381 |
-
|
382 |
-
### Added
|
383 |
-
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
384 |
-
|
385 |
-
### Removed
|
386 |
-
- Support for Python 3.6 (PR #260)
|
387 |
-
|
388 |
-
### Changed
|
389 |
-
- Optional speedup provided by mypy/c 1.0.1
|
390 |
-
|
391 |
-
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
392 |
-
|
393 |
-
### Fixed
|
394 |
-
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
395 |
-
|
396 |
-
### Changed
|
397 |
-
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
398 |
-
|
399 |
-
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
400 |
-
|
401 |
-
### Added
|
402 |
-
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
403 |
-
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
404 |
-
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
405 |
-
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
406 |
-
|
407 |
-
### Changed
|
408 |
-
- Build with static metadata using 'build' frontend
|
409 |
-
- Make the language detection stricter
|
410 |
-
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
411 |
-
|
412 |
-
### Fixed
|
413 |
-
- CLI with opt --normalize fail when using full path for files
|
414 |
-
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
415 |
-
- Sphinx warnings when generating the documentation
|
416 |
-
|
417 |
-
### Removed
|
418 |
-
- Coherence detector no longer return 'Simple English' instead return 'English'
|
419 |
-
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
420 |
-
- Breaking: Method `first()` and `best()` from CharsetMatch
|
421 |
-
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
422 |
-
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
423 |
-
- Breaking: Top-level function `normalize`
|
424 |
-
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
425 |
-
- Support for the backport `unicodedata2`
|
426 |
-
|
427 |
-
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
428 |
-
|
429 |
-
### Added
|
430 |
-
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
431 |
-
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
432 |
-
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
433 |
-
|
434 |
-
### Changed
|
435 |
-
- Build with static metadata using 'build' frontend
|
436 |
-
- Make the language detection stricter
|
437 |
-
|
438 |
-
### Fixed
|
439 |
-
- CLI with opt --normalize fail when using full path for files
|
440 |
-
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
441 |
-
|
442 |
-
### Removed
|
443 |
-
- Coherence detector no longer return 'Simple English' instead return 'English'
|
444 |
-
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
445 |
-
|
446 |
-
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
447 |
-
|
448 |
-
### Added
|
449 |
-
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
450 |
-
|
451 |
-
### Removed
|
452 |
-
- Breaking: Method `first()` and `best()` from CharsetMatch
|
453 |
-
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
454 |
-
|
455 |
-
### Fixed
|
456 |
-
- Sphinx warnings when generating the documentation
|
457 |
-
|
458 |
-
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
459 |
-
|
460 |
-
### Changed
|
461 |
-
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
462 |
-
|
463 |
-
### Removed
|
464 |
-
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
465 |
-
- Breaking: Top-level function `normalize`
|
466 |
-
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
467 |
-
- Support for the backport `unicodedata2`
|
468 |
-
|
469 |
-
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
470 |
-
|
471 |
-
### Deprecated
|
472 |
-
- Function `normalize` scheduled for removal in 3.0
|
473 |
-
|
474 |
-
### Changed
|
475 |
-
- Removed useless call to decode in fn is_unprintable (#206)
|
476 |
-
|
477 |
-
### Fixed
|
478 |
-
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
479 |
-
|
480 |
-
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
481 |
-
|
482 |
-
### Added
|
483 |
-
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
484 |
-
|
485 |
-
### Changed
|
486 |
-
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
487 |
-
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
488 |
-
|
489 |
-
### Fixed
|
490 |
-
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
491 |
-
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
492 |
-
|
493 |
-
### Removed
|
494 |
-
- Support for Python 3.5 (PR #192)
|
495 |
-
|
496 |
-
### Deprecated
|
497 |
-
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
498 |
-
|
499 |
-
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
500 |
-
|
501 |
-
### Fixed
|
502 |
-
- ASCII miss-detection on rare cases (PR #170)
|
503 |
-
|
504 |
-
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
505 |
-
|
506 |
-
### Added
|
507 |
-
- Explicit support for Python 3.11 (PR #164)
|
508 |
-
|
509 |
-
### Changed
|
510 |
-
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
511 |
-
|
512 |
-
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
513 |
-
|
514 |
-
### Fixed
|
515 |
-
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
516 |
-
|
517 |
-
### Changed
|
518 |
-
- Skipping the language-detection (CD) on ASCII (PR #155)
|
519 |
-
|
520 |
-
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
521 |
-
|
522 |
-
### Changed
|
523 |
-
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
524 |
-
|
525 |
-
### Fixed
|
526 |
-
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
527 |
-
|
528 |
-
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
529 |
-
### Changed
|
530 |
-
- Improvement over Vietnamese detection (PR #126)
|
531 |
-
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
532 |
-
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
533 |
-
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
534 |
-
- Code style as refactored by Sourcery-AI (PR #131)
|
535 |
-
- Minor adjustment on the MD around european words (PR #133)
|
536 |
-
- Remove and replace SRTs from assets / tests (PR #139)
|
537 |
-
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
538 |
-
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
539 |
-
|
540 |
-
### Fixed
|
541 |
-
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
542 |
-
- Avoid using too insignificant chunk (PR #137)
|
543 |
-
|
544 |
-
### Added
|
545 |
-
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
546 |
-
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
547 |
-
|
548 |
-
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
549 |
-
### Added
|
550 |
-
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
551 |
-
|
552 |
-
### Changed
|
553 |
-
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
554 |
-
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
555 |
-
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
556 |
-
- Various detection improvement (MD+CD) (PR #117)
|
557 |
-
|
558 |
-
### Removed
|
559 |
-
- Remove redundant logging entry about detected language(s) (PR #115)
|
560 |
-
|
561 |
-
### Fixed
|
562 |
-
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
563 |
-
|
564 |
-
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
565 |
-
### Fixed
|
566 |
-
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
567 |
-
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
568 |
-
|
569 |
-
### Changed
|
570 |
-
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
571 |
-
|
572 |
-
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
573 |
-
### Changed
|
574 |
-
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
575 |
-
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
576 |
-
- The Unicode detection is slightly improved (PR #93)
|
577 |
-
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
578 |
-
|
579 |
-
### Removed
|
580 |
-
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
|
581 |
-
|
582 |
-
### Fixed
|
583 |
-
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
584 |
-
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
585 |
-
- The MANIFEST.in was not exhaustive (PR #78)
|
586 |
-
|
587 |
-
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
588 |
-
### Fixed
|
589 |
-
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
590 |
-
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
591 |
-
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
592 |
-
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
593 |
-
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
594 |
-
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
595 |
-
|
596 |
-
### Changed
|
597 |
-
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
598 |
-
- Allow fallback on specified encoding if any (PR #71)
|
599 |
-
|
600 |
-
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
601 |
-
### Changed
|
602 |
-
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
603 |
-
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
604 |
-
|
605 |
-
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
606 |
-
### Fixed
|
607 |
-
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
608 |
-
|
609 |
-
### Changed
|
610 |
-
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
611 |
-
|
612 |
-
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
613 |
-
### Fixed
|
614 |
-
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
615 |
-
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
616 |
-
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
|
617 |
-
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
618 |
-
|
619 |
-
### Changed
|
620 |
-
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
621 |
-
|
622 |
-
### Added
|
623 |
-
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
624 |
-
|
625 |
-
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
626 |
-
### Changed
|
627 |
-
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
628 |
-
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
|
629 |
-
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
630 |
-
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
631 |
-
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
|
632 |
-
- utf_7 detection has been reinstated.
|
633 |
-
|
634 |
-
### Removed
|
635 |
-
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
|
636 |
-
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
637 |
-
- The exception hook on UnicodeDecodeError has been removed.
|
638 |
-
|
639 |
-
### Deprecated
|
640 |
-
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
641 |
-
|
642 |
-
### Fixed
|
643 |
-
- The CLI output used the relative path of the file(s). Should be absolute.
|
644 |
-
|
645 |
-
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
646 |
-
### Fixed
|
647 |
-
- Logger configuration/usage no longer conflict with others (PR #44)
|
648 |
-
|
649 |
-
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
650 |
-
### Removed
|
651 |
-
- Using standard logging instead of using the package loguru.
|
652 |
-
- Dropping nose test framework in favor of the maintained pytest.
|
653 |
-
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
654 |
-
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
655 |
-
- Stop support for UTF-7 that does not contain a SIG.
|
656 |
-
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
657 |
-
|
658 |
-
### Fixed
|
659 |
-
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
660 |
-
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
661 |
-
|
662 |
-
### Changed
|
663 |
-
- Improving the package final size by compressing frequencies.json.
|
664 |
-
- Huge improvement over the larges payload.
|
665 |
-
|
666 |
-
### Added
|
667 |
-
- CLI now produces JSON consumable output.
|
668 |
-
- Return ASCII if given sequences fit. Given reasonable confidence.
|
669 |
-
|
670 |
-
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
671 |
-
|
672 |
-
### Fixed
|
673 |
-
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
674 |
-
|
675 |
-
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
676 |
-
|
677 |
-
### Fixed
|
678 |
-
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
679 |
-
|
680 |
-
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
681 |
-
|
682 |
-
### Fixed
|
683 |
-
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
684 |
-
|
685 |
-
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
686 |
-
|
687 |
-
### Changed
|
688 |
-
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
689 |
-
|
690 |
-
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
691 |
-
|
692 |
-
### Fixed
|
693 |
-
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
694 |
-
|
695 |
-
### Changed
|
696 |
-
- Dependencies refactoring, constraints revised.
|
697 |
-
|
698 |
-
### Added
|
699 |
-
- Add python 3.9 and 3.10 to the supported interpreters
|
700 |
-
|
701 |
-
MIT License
|
702 |
-
|
703 |
-
Copyright (c) 2025 TAHRI Ahmed R.
|
704 |
-
|
705 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
706 |
-
of this software and associated documentation files (the "Software"), to deal
|
707 |
-
in the Software without restriction, including without limitation the rights
|
708 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
709 |
-
copies of the Software, and to permit persons to whom the Software is
|
710 |
-
furnished to do so, subject to the following conditions:
|
711 |
-
|
712 |
-
The above copyright notice and this permission notice shall be included in all
|
713 |
-
copies or substantial portions of the Software.
|
714 |
-
|
715 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
716 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
717 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
718 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
719 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
720 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
721 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
../../Scripts/normalizer.exe,sha256=aGyf7WAVLi4gHrr8F-d9-4fQG9ifpfMEXEvLwyt8KjI,108411
|
2 |
-
charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
3 |
-
charset_normalizer-3.4.1.dist-info/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
|
4 |
-
charset_normalizer-3.4.1.dist-info/METADATA,sha256=0_fAC3DknimRZusm6kkP4ylPD0JVzBq5mKHWLNBJM6w,36034
|
5 |
-
charset_normalizer-3.4.1.dist-info/RECORD,,
|
6 |
-
charset_normalizer-3.4.1.dist-info/WHEEL,sha256=pWXrJbnZSH-J-PhYmKs2XNn4DHCPNBYq965vsBJBFvA,101
|
7 |
-
charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
|
8 |
-
charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
9 |
-
charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
|
10 |
-
charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
|
11 |
-
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
|
12 |
-
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
|
13 |
-
charset_normalizer/__pycache__/api.cpython-312.pyc,,
|
14 |
-
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
|
15 |
-
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
|
16 |
-
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
|
17 |
-
charset_normalizer/__pycache__/md.cpython-312.pyc,,
|
18 |
-
charset_normalizer/__pycache__/models.cpython-312.pyc,,
|
19 |
-
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
|
20 |
-
charset_normalizer/__pycache__/version.cpython-312.pyc,,
|
21 |
-
charset_normalizer/api.py,sha256=2a0p2Gnhbdo9O6C04CNxTSN23fIbgOF20nxb0pWPNFM,23285
|
22 |
-
charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
|
23 |
-
charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
|
24 |
-
charset_normalizer/cli/__main__.py,sha256=lZ89qRWun7FRxX0qm1GhK-m0DH0i048yiMAX1mVIuRg,10731
|
25 |
-
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
|
26 |
-
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
|
27 |
-
charset_normalizer/constant.py,sha256=7OKYi28cJjZxIcX3lQCwfK9ijoOgaVEbERww7SqqNSY,42475
|
28 |
-
charset_normalizer/legacy.py,sha256=v8An1aAQHUu036UWOhyIaDGkirZ0t4hfNVlyje5KInU,2394
|
29 |
-
charset_normalizer/md.cp312-win_amd64.pyd,sha256=XBGy--IKda7c3iBfvw_dovocqb2RSucmVtxvtlG_3tA,10752
|
30 |
-
charset_normalizer/md.py,sha256=e452fhwIAguEUr3FJzG7QZvFgXI-dVLOh_M1ZUiFI6U,20666
|
31 |
-
charset_normalizer/md__mypyc.cp312-win_amd64.pyd,sha256=_-jWSji0BgBVvrIHbmabYQNMBF4-xTusdO5mu6P8JsA,125440
|
32 |
-
charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
|
33 |
-
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34 |
-
charset_normalizer/utils.py,sha256=oH9Q3WcAMwmsSB7uM8uDozz9DXnkYecbkTNbdnMbgzI,12410
|
35 |
-
charset_normalizer/version.py,sha256=7_thI7FzRQxEsbtUYwrJs3FCFWF666mw74H8mggPRR0,123
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
Wheel-Version: 1.0
|
2 |
-
Generator: setuptools (75.6.0)
|
3 |
-
Root-Is-Purelib: false
|
4 |
-
Tag: cp312-cp312-win_amd64
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
[console_scripts]
|
2 |
-
normalizer = charset_normalizer:cli.cli_detect
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
charset_normalizer
|
|
|
|
env/Lib/site-packages/charset_normalizer/__init__.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Charset-Normalizer
|
3 |
-
~~~~~~~~~~~~~~
|
4 |
-
The Real First Universal Charset Detector.
|
5 |
-
A library that helps you read text from an unknown charset encoding.
|
6 |
-
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
7 |
-
All IANA character set names for which the Python core library provides codecs are supported.
|
8 |
-
|
9 |
-
Basic usage:
|
10 |
-
>>> from charset_normalizer import from_bytes
|
11 |
-
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
12 |
-
>>> best_guess = results.best()
|
13 |
-
>>> str(best_guess)
|
14 |
-
'Bсеки човек има право на образование. Oбразованието!'
|
15 |
-
|
16 |
-
Others methods and usages are available - see the full documentation
|
17 |
-
at <https://github.com/Ousret/charset_normalizer>.
|
18 |
-
:copyright: (c) 2021 by Ahmed TAHRI
|
19 |
-
:license: MIT, see LICENSE for more details.
|
20 |
-
"""
|
21 |
-
|
22 |
-
from __future__ import annotations
|
23 |
-
|
24 |
-
import logging
|
25 |
-
|
26 |
-
from .api import from_bytes, from_fp, from_path, is_binary
|
27 |
-
from .legacy import detect
|
28 |
-
from .models import CharsetMatch, CharsetMatches
|
29 |
-
from .utils import set_logging_handler
|
30 |
-
from .version import VERSION, __version__
|
31 |
-
|
32 |
-
__all__ = (
|
33 |
-
"from_fp",
|
34 |
-
"from_path",
|
35 |
-
"from_bytes",
|
36 |
-
"is_binary",
|
37 |
-
"detect",
|
38 |
-
"CharsetMatch",
|
39 |
-
"CharsetMatches",
|
40 |
-
"__version__",
|
41 |
-
"VERSION",
|
42 |
-
"set_logging_handler",
|
43 |
-
)
|
44 |
-
|
45 |
-
# Attach a NullHandler to the top level logger by default
|
46 |
-
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
47 |
-
|
48 |
-
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/__main__.py
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
from .cli import cli_detect
|
4 |
-
|
5 |
-
if __name__ == "__main__":
|
6 |
-
cli_detect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/api.py
DELETED
@@ -1,668 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
import logging
|
4 |
-
from os import PathLike
|
5 |
-
from typing import BinaryIO
|
6 |
-
|
7 |
-
from .cd import (
|
8 |
-
coherence_ratio,
|
9 |
-
encoding_languages,
|
10 |
-
mb_encoding_languages,
|
11 |
-
merge_coherence_ratios,
|
12 |
-
)
|
13 |
-
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
14 |
-
from .md import mess_ratio
|
15 |
-
from .models import CharsetMatch, CharsetMatches
|
16 |
-
from .utils import (
|
17 |
-
any_specified_encoding,
|
18 |
-
cut_sequence_chunks,
|
19 |
-
iana_name,
|
20 |
-
identify_sig_or_bom,
|
21 |
-
is_cp_similar,
|
22 |
-
is_multi_byte_encoding,
|
23 |
-
should_strip_sig_or_bom,
|
24 |
-
)
|
25 |
-
|
26 |
-
logger = logging.getLogger("charset_normalizer")
|
27 |
-
explain_handler = logging.StreamHandler()
|
28 |
-
explain_handler.setFormatter(
|
29 |
-
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
30 |
-
)
|
31 |
-
|
32 |
-
|
33 |
-
def from_bytes(
|
34 |
-
sequences: bytes | bytearray,
|
35 |
-
steps: int = 5,
|
36 |
-
chunk_size: int = 512,
|
37 |
-
threshold: float = 0.2,
|
38 |
-
cp_isolation: list[str] | None = None,
|
39 |
-
cp_exclusion: list[str] | None = None,
|
40 |
-
preemptive_behaviour: bool = True,
|
41 |
-
explain: bool = False,
|
42 |
-
language_threshold: float = 0.1,
|
43 |
-
enable_fallback: bool = True,
|
44 |
-
) -> CharsetMatches:
|
45 |
-
"""
|
46 |
-
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
47 |
-
If there is no results, it is a strong indicator that the source is binary/not text.
|
48 |
-
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
|
49 |
-
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
50 |
-
|
51 |
-
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
52 |
-
but never take it for granted. Can improve the performance.
|
53 |
-
|
54 |
-
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
55 |
-
purpose.
|
56 |
-
|
57 |
-
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
58 |
-
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
59 |
-
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
60 |
-
Custom logging format and handler can be set manually.
|
61 |
-
"""
|
62 |
-
|
63 |
-
if not isinstance(sequences, (bytearray, bytes)):
|
64 |
-
raise TypeError(
|
65 |
-
"Expected object of type bytes or bytearray, got: {}".format(
|
66 |
-
type(sequences)
|
67 |
-
)
|
68 |
-
)
|
69 |
-
|
70 |
-
if explain:
|
71 |
-
previous_logger_level: int = logger.level
|
72 |
-
logger.addHandler(explain_handler)
|
73 |
-
logger.setLevel(TRACE)
|
74 |
-
|
75 |
-
length: int = len(sequences)
|
76 |
-
|
77 |
-
if length == 0:
|
78 |
-
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
79 |
-
if explain: # Defensive: ensure exit path clean handler
|
80 |
-
logger.removeHandler(explain_handler)
|
81 |
-
logger.setLevel(previous_logger_level or logging.WARNING)
|
82 |
-
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
83 |
-
|
84 |
-
if cp_isolation is not None:
|
85 |
-
logger.log(
|
86 |
-
TRACE,
|
87 |
-
"cp_isolation is set. use this flag for debugging purpose. "
|
88 |
-
"limited list of encoding allowed : %s.",
|
89 |
-
", ".join(cp_isolation),
|
90 |
-
)
|
91 |
-
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
92 |
-
else:
|
93 |
-
cp_isolation = []
|
94 |
-
|
95 |
-
if cp_exclusion is not None:
|
96 |
-
logger.log(
|
97 |
-
TRACE,
|
98 |
-
"cp_exclusion is set. use this flag for debugging purpose. "
|
99 |
-
"limited list of encoding excluded : %s.",
|
100 |
-
", ".join(cp_exclusion),
|
101 |
-
)
|
102 |
-
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
103 |
-
else:
|
104 |
-
cp_exclusion = []
|
105 |
-
|
106 |
-
if length <= (chunk_size * steps):
|
107 |
-
logger.log(
|
108 |
-
TRACE,
|
109 |
-
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
110 |
-
steps,
|
111 |
-
chunk_size,
|
112 |
-
length,
|
113 |
-
)
|
114 |
-
steps = 1
|
115 |
-
chunk_size = length
|
116 |
-
|
117 |
-
if steps > 1 and length / steps < chunk_size:
|
118 |
-
chunk_size = int(length / steps)
|
119 |
-
|
120 |
-
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
121 |
-
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
122 |
-
|
123 |
-
if is_too_small_sequence:
|
124 |
-
logger.log(
|
125 |
-
TRACE,
|
126 |
-
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
127 |
-
length
|
128 |
-
),
|
129 |
-
)
|
130 |
-
elif is_too_large_sequence:
|
131 |
-
logger.log(
|
132 |
-
TRACE,
|
133 |
-
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
134 |
-
length
|
135 |
-
),
|
136 |
-
)
|
137 |
-
|
138 |
-
prioritized_encodings: list[str] = []
|
139 |
-
|
140 |
-
specified_encoding: str | None = (
|
141 |
-
any_specified_encoding(sequences) if preemptive_behaviour else None
|
142 |
-
)
|
143 |
-
|
144 |
-
if specified_encoding is not None:
|
145 |
-
prioritized_encodings.append(specified_encoding)
|
146 |
-
logger.log(
|
147 |
-
TRACE,
|
148 |
-
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
149 |
-
specified_encoding,
|
150 |
-
)
|
151 |
-
|
152 |
-
tested: set[str] = set()
|
153 |
-
tested_but_hard_failure: list[str] = []
|
154 |
-
tested_but_soft_failure: list[str] = []
|
155 |
-
|
156 |
-
fallback_ascii: CharsetMatch | None = None
|
157 |
-
fallback_u8: CharsetMatch | None = None
|
158 |
-
fallback_specified: CharsetMatch | None = None
|
159 |
-
|
160 |
-
results: CharsetMatches = CharsetMatches()
|
161 |
-
|
162 |
-
early_stop_results: CharsetMatches = CharsetMatches()
|
163 |
-
|
164 |
-
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
165 |
-
|
166 |
-
if sig_encoding is not None:
|
167 |
-
prioritized_encodings.append(sig_encoding)
|
168 |
-
logger.log(
|
169 |
-
TRACE,
|
170 |
-
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
171 |
-
len(sig_payload),
|
172 |
-
sig_encoding,
|
173 |
-
)
|
174 |
-
|
175 |
-
prioritized_encodings.append("ascii")
|
176 |
-
|
177 |
-
if "utf_8" not in prioritized_encodings:
|
178 |
-
prioritized_encodings.append("utf_8")
|
179 |
-
|
180 |
-
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
181 |
-
if cp_isolation and encoding_iana not in cp_isolation:
|
182 |
-
continue
|
183 |
-
|
184 |
-
if cp_exclusion and encoding_iana in cp_exclusion:
|
185 |
-
continue
|
186 |
-
|
187 |
-
if encoding_iana in tested:
|
188 |
-
continue
|
189 |
-
|
190 |
-
tested.add(encoding_iana)
|
191 |
-
|
192 |
-
decoded_payload: str | None = None
|
193 |
-
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
194 |
-
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
195 |
-
encoding_iana
|
196 |
-
)
|
197 |
-
|
198 |
-
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
199 |
-
logger.log(
|
200 |
-
TRACE,
|
201 |
-
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
202 |
-
encoding_iana,
|
203 |
-
)
|
204 |
-
continue
|
205 |
-
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
206 |
-
logger.log(
|
207 |
-
TRACE,
|
208 |
-
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
209 |
-
encoding_iana,
|
210 |
-
)
|
211 |
-
continue
|
212 |
-
|
213 |
-
try:
|
214 |
-
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
215 |
-
except (ModuleNotFoundError, ImportError):
|
216 |
-
logger.log(
|
217 |
-
TRACE,
|
218 |
-
"Encoding %s does not provide an IncrementalDecoder",
|
219 |
-
encoding_iana,
|
220 |
-
)
|
221 |
-
continue
|
222 |
-
|
223 |
-
try:
|
224 |
-
if is_too_large_sequence and is_multi_byte_decoder is False:
|
225 |
-
str(
|
226 |
-
(
|
227 |
-
sequences[: int(50e4)]
|
228 |
-
if strip_sig_or_bom is False
|
229 |
-
else sequences[len(sig_payload) : int(50e4)]
|
230 |
-
),
|
231 |
-
encoding=encoding_iana,
|
232 |
-
)
|
233 |
-
else:
|
234 |
-
decoded_payload = str(
|
235 |
-
(
|
236 |
-
sequences
|
237 |
-
if strip_sig_or_bom is False
|
238 |
-
else sequences[len(sig_payload) :]
|
239 |
-
),
|
240 |
-
encoding=encoding_iana,
|
241 |
-
)
|
242 |
-
except (UnicodeDecodeError, LookupError) as e:
|
243 |
-
if not isinstance(e, LookupError):
|
244 |
-
logger.log(
|
245 |
-
TRACE,
|
246 |
-
"Code page %s does not fit given bytes sequence at ALL. %s",
|
247 |
-
encoding_iana,
|
248 |
-
str(e),
|
249 |
-
)
|
250 |
-
tested_but_hard_failure.append(encoding_iana)
|
251 |
-
continue
|
252 |
-
|
253 |
-
similar_soft_failure_test: bool = False
|
254 |
-
|
255 |
-
for encoding_soft_failed in tested_but_soft_failure:
|
256 |
-
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
257 |
-
similar_soft_failure_test = True
|
258 |
-
break
|
259 |
-
|
260 |
-
if similar_soft_failure_test:
|
261 |
-
logger.log(
|
262 |
-
TRACE,
|
263 |
-
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
264 |
-
encoding_iana,
|
265 |
-
encoding_soft_failed,
|
266 |
-
)
|
267 |
-
continue
|
268 |
-
|
269 |
-
r_ = range(
|
270 |
-
0 if not bom_or_sig_available else len(sig_payload),
|
271 |
-
length,
|
272 |
-
int(length / steps),
|
273 |
-
)
|
274 |
-
|
275 |
-
multi_byte_bonus: bool = (
|
276 |
-
is_multi_byte_decoder
|
277 |
-
and decoded_payload is not None
|
278 |
-
and len(decoded_payload) < length
|
279 |
-
)
|
280 |
-
|
281 |
-
if multi_byte_bonus:
|
282 |
-
logger.log(
|
283 |
-
TRACE,
|
284 |
-
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
285 |
-
"was encoded using n-bytes.",
|
286 |
-
encoding_iana,
|
287 |
-
)
|
288 |
-
|
289 |
-
max_chunk_gave_up: int = int(len(r_) / 4)
|
290 |
-
|
291 |
-
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
292 |
-
early_stop_count: int = 0
|
293 |
-
lazy_str_hard_failure = False
|
294 |
-
|
295 |
-
md_chunks: list[str] = []
|
296 |
-
md_ratios = []
|
297 |
-
|
298 |
-
try:
|
299 |
-
for chunk in cut_sequence_chunks(
|
300 |
-
sequences,
|
301 |
-
encoding_iana,
|
302 |
-
r_,
|
303 |
-
chunk_size,
|
304 |
-
bom_or_sig_available,
|
305 |
-
strip_sig_or_bom,
|
306 |
-
sig_payload,
|
307 |
-
is_multi_byte_decoder,
|
308 |
-
decoded_payload,
|
309 |
-
):
|
310 |
-
md_chunks.append(chunk)
|
311 |
-
|
312 |
-
md_ratios.append(
|
313 |
-
mess_ratio(
|
314 |
-
chunk,
|
315 |
-
threshold,
|
316 |
-
explain is True and 1 <= len(cp_isolation) <= 2,
|
317 |
-
)
|
318 |
-
)
|
319 |
-
|
320 |
-
if md_ratios[-1] >= threshold:
|
321 |
-
early_stop_count += 1
|
322 |
-
|
323 |
-
if (early_stop_count >= max_chunk_gave_up) or (
|
324 |
-
bom_or_sig_available and strip_sig_or_bom is False
|
325 |
-
):
|
326 |
-
break
|
327 |
-
except (
|
328 |
-
UnicodeDecodeError
|
329 |
-
) as e: # Lazy str loading may have missed something there
|
330 |
-
logger.log(
|
331 |
-
TRACE,
|
332 |
-
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
333 |
-
encoding_iana,
|
334 |
-
str(e),
|
335 |
-
)
|
336 |
-
early_stop_count = max_chunk_gave_up
|
337 |
-
lazy_str_hard_failure = True
|
338 |
-
|
339 |
-
# We might want to check the sequence again with the whole content
|
340 |
-
# Only if initial MD tests passes
|
341 |
-
if (
|
342 |
-
not lazy_str_hard_failure
|
343 |
-
and is_too_large_sequence
|
344 |
-
and not is_multi_byte_decoder
|
345 |
-
):
|
346 |
-
try:
|
347 |
-
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
348 |
-
except UnicodeDecodeError as e:
|
349 |
-
logger.log(
|
350 |
-
TRACE,
|
351 |
-
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
352 |
-
encoding_iana,
|
353 |
-
str(e),
|
354 |
-
)
|
355 |
-
tested_but_hard_failure.append(encoding_iana)
|
356 |
-
continue
|
357 |
-
|
358 |
-
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
359 |
-
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
360 |
-
tested_but_soft_failure.append(encoding_iana)
|
361 |
-
logger.log(
|
362 |
-
TRACE,
|
363 |
-
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
364 |
-
"Computed mean chaos is %f %%.",
|
365 |
-
encoding_iana,
|
366 |
-
early_stop_count,
|
367 |
-
round(mean_mess_ratio * 100, ndigits=3),
|
368 |
-
)
|
369 |
-
# Preparing those fallbacks in case we got nothing.
|
370 |
-
if (
|
371 |
-
enable_fallback
|
372 |
-
and encoding_iana in ["ascii", "utf_8", specified_encoding]
|
373 |
-
and not lazy_str_hard_failure
|
374 |
-
):
|
375 |
-
fallback_entry = CharsetMatch(
|
376 |
-
sequences,
|
377 |
-
encoding_iana,
|
378 |
-
threshold,
|
379 |
-
False,
|
380 |
-
[],
|
381 |
-
decoded_payload,
|
382 |
-
preemptive_declaration=specified_encoding,
|
383 |
-
)
|
384 |
-
if encoding_iana == specified_encoding:
|
385 |
-
fallback_specified = fallback_entry
|
386 |
-
elif encoding_iana == "ascii":
|
387 |
-
fallback_ascii = fallback_entry
|
388 |
-
else:
|
389 |
-
fallback_u8 = fallback_entry
|
390 |
-
continue
|
391 |
-
|
392 |
-
logger.log(
|
393 |
-
TRACE,
|
394 |
-
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
395 |
-
encoding_iana,
|
396 |
-
round(mean_mess_ratio * 100, ndigits=3),
|
397 |
-
)
|
398 |
-
|
399 |
-
if not is_multi_byte_decoder:
|
400 |
-
target_languages: list[str] = encoding_languages(encoding_iana)
|
401 |
-
else:
|
402 |
-
target_languages = mb_encoding_languages(encoding_iana)
|
403 |
-
|
404 |
-
if target_languages:
|
405 |
-
logger.log(
|
406 |
-
TRACE,
|
407 |
-
"{} should target any language(s) of {}".format(
|
408 |
-
encoding_iana, str(target_languages)
|
409 |
-
),
|
410 |
-
)
|
411 |
-
|
412 |
-
cd_ratios = []
|
413 |
-
|
414 |
-
# We shall skip the CD when its about ASCII
|
415 |
-
# Most of the time its not relevant to run "language-detection" on it.
|
416 |
-
if encoding_iana != "ascii":
|
417 |
-
for chunk in md_chunks:
|
418 |
-
chunk_languages = coherence_ratio(
|
419 |
-
chunk,
|
420 |
-
language_threshold,
|
421 |
-
",".join(target_languages) if target_languages else None,
|
422 |
-
)
|
423 |
-
|
424 |
-
cd_ratios.append(chunk_languages)
|
425 |
-
|
426 |
-
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
427 |
-
|
428 |
-
if cd_ratios_merged:
|
429 |
-
logger.log(
|
430 |
-
TRACE,
|
431 |
-
"We detected language {} using {}".format(
|
432 |
-
cd_ratios_merged, encoding_iana
|
433 |
-
),
|
434 |
-
)
|
435 |
-
|
436 |
-
current_match = CharsetMatch(
|
437 |
-
sequences,
|
438 |
-
encoding_iana,
|
439 |
-
mean_mess_ratio,
|
440 |
-
bom_or_sig_available,
|
441 |
-
cd_ratios_merged,
|
442 |
-
(
|
443 |
-
decoded_payload
|
444 |
-
if (
|
445 |
-
is_too_large_sequence is False
|
446 |
-
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
447 |
-
)
|
448 |
-
else None
|
449 |
-
),
|
450 |
-
preemptive_declaration=specified_encoding,
|
451 |
-
)
|
452 |
-
|
453 |
-
results.append(current_match)
|
454 |
-
|
455 |
-
if (
|
456 |
-
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
457 |
-
and mean_mess_ratio < 0.1
|
458 |
-
):
|
459 |
-
# If md says nothing to worry about, then... stop immediately!
|
460 |
-
if mean_mess_ratio == 0.0:
|
461 |
-
logger.debug(
|
462 |
-
"Encoding detection: %s is most likely the one.",
|
463 |
-
current_match.encoding,
|
464 |
-
)
|
465 |
-
if explain: # Defensive: ensure exit path clean handler
|
466 |
-
logger.removeHandler(explain_handler)
|
467 |
-
logger.setLevel(previous_logger_level)
|
468 |
-
return CharsetMatches([current_match])
|
469 |
-
|
470 |
-
early_stop_results.append(current_match)
|
471 |
-
|
472 |
-
if (
|
473 |
-
len(early_stop_results)
|
474 |
-
and (specified_encoding is None or specified_encoding in tested)
|
475 |
-
and "ascii" in tested
|
476 |
-
and "utf_8" in tested
|
477 |
-
):
|
478 |
-
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
479 |
-
logger.debug(
|
480 |
-
"Encoding detection: %s is most likely the one.",
|
481 |
-
probable_result.encoding,
|
482 |
-
)
|
483 |
-
if explain: # Defensive: ensure exit path clean handler
|
484 |
-
logger.removeHandler(explain_handler)
|
485 |
-
logger.setLevel(previous_logger_level)
|
486 |
-
|
487 |
-
return CharsetMatches([probable_result])
|
488 |
-
|
489 |
-
if encoding_iana == sig_encoding:
|
490 |
-
logger.debug(
|
491 |
-
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
492 |
-
"the beginning of the sequence.",
|
493 |
-
encoding_iana,
|
494 |
-
)
|
495 |
-
if explain: # Defensive: ensure exit path clean handler
|
496 |
-
logger.removeHandler(explain_handler)
|
497 |
-
logger.setLevel(previous_logger_level)
|
498 |
-
return CharsetMatches([results[encoding_iana]])
|
499 |
-
|
500 |
-
if len(results) == 0:
|
501 |
-
if fallback_u8 or fallback_ascii or fallback_specified:
|
502 |
-
logger.log(
|
503 |
-
TRACE,
|
504 |
-
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
505 |
-
)
|
506 |
-
|
507 |
-
if fallback_specified:
|
508 |
-
logger.debug(
|
509 |
-
"Encoding detection: %s will be used as a fallback match",
|
510 |
-
fallback_specified.encoding,
|
511 |
-
)
|
512 |
-
results.append(fallback_specified)
|
513 |
-
elif (
|
514 |
-
(fallback_u8 and fallback_ascii is None)
|
515 |
-
or (
|
516 |
-
fallback_u8
|
517 |
-
and fallback_ascii
|
518 |
-
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
519 |
-
)
|
520 |
-
or (fallback_u8 is not None)
|
521 |
-
):
|
522 |
-
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
523 |
-
results.append(fallback_u8)
|
524 |
-
elif fallback_ascii:
|
525 |
-
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
526 |
-
results.append(fallback_ascii)
|
527 |
-
|
528 |
-
if results:
|
529 |
-
logger.debug(
|
530 |
-
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
531 |
-
results.best().encoding, # type: ignore
|
532 |
-
len(results) - 1,
|
533 |
-
)
|
534 |
-
else:
|
535 |
-
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
536 |
-
|
537 |
-
if explain:
|
538 |
-
logger.removeHandler(explain_handler)
|
539 |
-
logger.setLevel(previous_logger_level)
|
540 |
-
|
541 |
-
return results
|
542 |
-
|
543 |
-
|
544 |
-
def from_fp(
|
545 |
-
fp: BinaryIO,
|
546 |
-
steps: int = 5,
|
547 |
-
chunk_size: int = 512,
|
548 |
-
threshold: float = 0.20,
|
549 |
-
cp_isolation: list[str] | None = None,
|
550 |
-
cp_exclusion: list[str] | None = None,
|
551 |
-
preemptive_behaviour: bool = True,
|
552 |
-
explain: bool = False,
|
553 |
-
language_threshold: float = 0.1,
|
554 |
-
enable_fallback: bool = True,
|
555 |
-
) -> CharsetMatches:
|
556 |
-
"""
|
557 |
-
Same thing than the function from_bytes but using a file pointer that is already ready.
|
558 |
-
Will not close the file pointer.
|
559 |
-
"""
|
560 |
-
return from_bytes(
|
561 |
-
fp.read(),
|
562 |
-
steps,
|
563 |
-
chunk_size,
|
564 |
-
threshold,
|
565 |
-
cp_isolation,
|
566 |
-
cp_exclusion,
|
567 |
-
preemptive_behaviour,
|
568 |
-
explain,
|
569 |
-
language_threshold,
|
570 |
-
enable_fallback,
|
571 |
-
)
|
572 |
-
|
573 |
-
|
574 |
-
def from_path(
|
575 |
-
path: str | bytes | PathLike, # type: ignore[type-arg]
|
576 |
-
steps: int = 5,
|
577 |
-
chunk_size: int = 512,
|
578 |
-
threshold: float = 0.20,
|
579 |
-
cp_isolation: list[str] | None = None,
|
580 |
-
cp_exclusion: list[str] | None = None,
|
581 |
-
preemptive_behaviour: bool = True,
|
582 |
-
explain: bool = False,
|
583 |
-
language_threshold: float = 0.1,
|
584 |
-
enable_fallback: bool = True,
|
585 |
-
) -> CharsetMatches:
|
586 |
-
"""
|
587 |
-
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
|
588 |
-
Can raise IOError.
|
589 |
-
"""
|
590 |
-
with open(path, "rb") as fp:
|
591 |
-
return from_fp(
|
592 |
-
fp,
|
593 |
-
steps,
|
594 |
-
chunk_size,
|
595 |
-
threshold,
|
596 |
-
cp_isolation,
|
597 |
-
cp_exclusion,
|
598 |
-
preemptive_behaviour,
|
599 |
-
explain,
|
600 |
-
language_threshold,
|
601 |
-
enable_fallback,
|
602 |
-
)
|
603 |
-
|
604 |
-
|
605 |
-
def is_binary(
|
606 |
-
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
|
607 |
-
steps: int = 5,
|
608 |
-
chunk_size: int = 512,
|
609 |
-
threshold: float = 0.20,
|
610 |
-
cp_isolation: list[str] | None = None,
|
611 |
-
cp_exclusion: list[str] | None = None,
|
612 |
-
preemptive_behaviour: bool = True,
|
613 |
-
explain: bool = False,
|
614 |
-
language_threshold: float = 0.1,
|
615 |
-
enable_fallback: bool = False,
|
616 |
-
) -> bool:
|
617 |
-
"""
|
618 |
-
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
|
619 |
-
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
|
620 |
-
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
|
621 |
-
"""
|
622 |
-
if isinstance(fp_or_path_or_payload, (str, PathLike)):
|
623 |
-
guesses = from_path(
|
624 |
-
fp_or_path_or_payload,
|
625 |
-
steps=steps,
|
626 |
-
chunk_size=chunk_size,
|
627 |
-
threshold=threshold,
|
628 |
-
cp_isolation=cp_isolation,
|
629 |
-
cp_exclusion=cp_exclusion,
|
630 |
-
preemptive_behaviour=preemptive_behaviour,
|
631 |
-
explain=explain,
|
632 |
-
language_threshold=language_threshold,
|
633 |
-
enable_fallback=enable_fallback,
|
634 |
-
)
|
635 |
-
elif isinstance(
|
636 |
-
fp_or_path_or_payload,
|
637 |
-
(
|
638 |
-
bytes,
|
639 |
-
bytearray,
|
640 |
-
),
|
641 |
-
):
|
642 |
-
guesses = from_bytes(
|
643 |
-
fp_or_path_or_payload,
|
644 |
-
steps=steps,
|
645 |
-
chunk_size=chunk_size,
|
646 |
-
threshold=threshold,
|
647 |
-
cp_isolation=cp_isolation,
|
648 |
-
cp_exclusion=cp_exclusion,
|
649 |
-
preemptive_behaviour=preemptive_behaviour,
|
650 |
-
explain=explain,
|
651 |
-
language_threshold=language_threshold,
|
652 |
-
enable_fallback=enable_fallback,
|
653 |
-
)
|
654 |
-
else:
|
655 |
-
guesses = from_fp(
|
656 |
-
fp_or_path_or_payload,
|
657 |
-
steps=steps,
|
658 |
-
chunk_size=chunk_size,
|
659 |
-
threshold=threshold,
|
660 |
-
cp_isolation=cp_isolation,
|
661 |
-
cp_exclusion=cp_exclusion,
|
662 |
-
preemptive_behaviour=preemptive_behaviour,
|
663 |
-
explain=explain,
|
664 |
-
language_threshold=language_threshold,
|
665 |
-
enable_fallback=enable_fallback,
|
666 |
-
)
|
667 |
-
|
668 |
-
return not guesses
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/cd.py
DELETED
@@ -1,395 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
import importlib
|
4 |
-
from codecs import IncrementalDecoder
|
5 |
-
from collections import Counter
|
6 |
-
from functools import lru_cache
|
7 |
-
from typing import Counter as TypeCounter
|
8 |
-
|
9 |
-
from .constant import (
|
10 |
-
FREQUENCIES,
|
11 |
-
KO_NAMES,
|
12 |
-
LANGUAGE_SUPPORTED_COUNT,
|
13 |
-
TOO_SMALL_SEQUENCE,
|
14 |
-
ZH_NAMES,
|
15 |
-
)
|
16 |
-
from .md import is_suspiciously_successive_range
|
17 |
-
from .models import CoherenceMatches
|
18 |
-
from .utils import (
|
19 |
-
is_accentuated,
|
20 |
-
is_latin,
|
21 |
-
is_multi_byte_encoding,
|
22 |
-
is_unicode_range_secondary,
|
23 |
-
unicode_range,
|
24 |
-
)
|
25 |
-
|
26 |
-
|
27 |
-
def encoding_unicode_range(iana_name: str) -> list[str]:
|
28 |
-
"""
|
29 |
-
Return associated unicode ranges in a single byte code page.
|
30 |
-
"""
|
31 |
-
if is_multi_byte_encoding(iana_name):
|
32 |
-
raise OSError("Function not supported on multi-byte code page")
|
33 |
-
|
34 |
-
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
35 |
-
|
36 |
-
p: IncrementalDecoder = decoder(errors="ignore")
|
37 |
-
seen_ranges: dict[str, int] = {}
|
38 |
-
character_count: int = 0
|
39 |
-
|
40 |
-
for i in range(0x40, 0xFF):
|
41 |
-
chunk: str = p.decode(bytes([i]))
|
42 |
-
|
43 |
-
if chunk:
|
44 |
-
character_range: str | None = unicode_range(chunk)
|
45 |
-
|
46 |
-
if character_range is None:
|
47 |
-
continue
|
48 |
-
|
49 |
-
if is_unicode_range_secondary(character_range) is False:
|
50 |
-
if character_range not in seen_ranges:
|
51 |
-
seen_ranges[character_range] = 0
|
52 |
-
seen_ranges[character_range] += 1
|
53 |
-
character_count += 1
|
54 |
-
|
55 |
-
return sorted(
|
56 |
-
[
|
57 |
-
character_range
|
58 |
-
for character_range in seen_ranges
|
59 |
-
if seen_ranges[character_range] / character_count >= 0.15
|
60 |
-
]
|
61 |
-
)
|
62 |
-
|
63 |
-
|
64 |
-
def unicode_range_languages(primary_range: str) -> list[str]:
|
65 |
-
"""
|
66 |
-
Return inferred languages used with a unicode range.
|
67 |
-
"""
|
68 |
-
languages: list[str] = []
|
69 |
-
|
70 |
-
for language, characters in FREQUENCIES.items():
|
71 |
-
for character in characters:
|
72 |
-
if unicode_range(character) == primary_range:
|
73 |
-
languages.append(language)
|
74 |
-
break
|
75 |
-
|
76 |
-
return languages
|
77 |
-
|
78 |
-
|
79 |
-
@lru_cache()
|
80 |
-
def encoding_languages(iana_name: str) -> list[str]:
|
81 |
-
"""
|
82 |
-
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
83 |
-
This function does the correspondence.
|
84 |
-
"""
|
85 |
-
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
86 |
-
primary_range: str | None = None
|
87 |
-
|
88 |
-
for specified_range in unicode_ranges:
|
89 |
-
if "Latin" not in specified_range:
|
90 |
-
primary_range = specified_range
|
91 |
-
break
|
92 |
-
|
93 |
-
if primary_range is None:
|
94 |
-
return ["Latin Based"]
|
95 |
-
|
96 |
-
return unicode_range_languages(primary_range)
|
97 |
-
|
98 |
-
|
99 |
-
@lru_cache()
|
100 |
-
def mb_encoding_languages(iana_name: str) -> list[str]:
|
101 |
-
"""
|
102 |
-
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
103 |
-
This function does the correspondence.
|
104 |
-
"""
|
105 |
-
if (
|
106 |
-
iana_name.startswith("shift_")
|
107 |
-
or iana_name.startswith("iso2022_jp")
|
108 |
-
or iana_name.startswith("euc_j")
|
109 |
-
or iana_name == "cp932"
|
110 |
-
):
|
111 |
-
return ["Japanese"]
|
112 |
-
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
113 |
-
return ["Chinese"]
|
114 |
-
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
115 |
-
return ["Korean"]
|
116 |
-
|
117 |
-
return []
|
118 |
-
|
119 |
-
|
120 |
-
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
121 |
-
def get_target_features(language: str) -> tuple[bool, bool]:
|
122 |
-
"""
|
123 |
-
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
124 |
-
"""
|
125 |
-
target_have_accents: bool = False
|
126 |
-
target_pure_latin: bool = True
|
127 |
-
|
128 |
-
for character in FREQUENCIES[language]:
|
129 |
-
if not target_have_accents and is_accentuated(character):
|
130 |
-
target_have_accents = True
|
131 |
-
if target_pure_latin and is_latin(character) is False:
|
132 |
-
target_pure_latin = False
|
133 |
-
|
134 |
-
return target_have_accents, target_pure_latin
|
135 |
-
|
136 |
-
|
137 |
-
def alphabet_languages(
|
138 |
-
characters: list[str], ignore_non_latin: bool = False
|
139 |
-
) -> list[str]:
|
140 |
-
"""
|
141 |
-
Return associated languages associated to given characters.
|
142 |
-
"""
|
143 |
-
languages: list[tuple[str, float]] = []
|
144 |
-
|
145 |
-
source_have_accents = any(is_accentuated(character) for character in characters)
|
146 |
-
|
147 |
-
for language, language_characters in FREQUENCIES.items():
|
148 |
-
target_have_accents, target_pure_latin = get_target_features(language)
|
149 |
-
|
150 |
-
if ignore_non_latin and target_pure_latin is False:
|
151 |
-
continue
|
152 |
-
|
153 |
-
if target_have_accents is False and source_have_accents:
|
154 |
-
continue
|
155 |
-
|
156 |
-
character_count: int = len(language_characters)
|
157 |
-
|
158 |
-
character_match_count: int = len(
|
159 |
-
[c for c in language_characters if c in characters]
|
160 |
-
)
|
161 |
-
|
162 |
-
ratio: float = character_match_count / character_count
|
163 |
-
|
164 |
-
if ratio >= 0.2:
|
165 |
-
languages.append((language, ratio))
|
166 |
-
|
167 |
-
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
168 |
-
|
169 |
-
return [compatible_language[0] for compatible_language in languages]
|
170 |
-
|
171 |
-
|
172 |
-
def characters_popularity_compare(
|
173 |
-
language: str, ordered_characters: list[str]
|
174 |
-
) -> float:
|
175 |
-
"""
|
176 |
-
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
|
177 |
-
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
178 |
-
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
|
179 |
-
"""
|
180 |
-
if language not in FREQUENCIES:
|
181 |
-
raise ValueError(f"{language} not available")
|
182 |
-
|
183 |
-
character_approved_count: int = 0
|
184 |
-
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
185 |
-
|
186 |
-
ordered_characters_count: int = len(ordered_characters)
|
187 |
-
target_language_characters_count: int = len(FREQUENCIES[language])
|
188 |
-
|
189 |
-
large_alphabet: bool = target_language_characters_count > 26
|
190 |
-
|
191 |
-
for character, character_rank in zip(
|
192 |
-
ordered_characters, range(0, ordered_characters_count)
|
193 |
-
):
|
194 |
-
if character not in FREQUENCIES_language_set:
|
195 |
-
continue
|
196 |
-
|
197 |
-
character_rank_in_language: int = FREQUENCIES[language].index(character)
|
198 |
-
expected_projection_ratio: float = (
|
199 |
-
target_language_characters_count / ordered_characters_count
|
200 |
-
)
|
201 |
-
character_rank_projection: int = int(character_rank * expected_projection_ratio)
|
202 |
-
|
203 |
-
if (
|
204 |
-
large_alphabet is False
|
205 |
-
and abs(character_rank_projection - character_rank_in_language) > 4
|
206 |
-
):
|
207 |
-
continue
|
208 |
-
|
209 |
-
if (
|
210 |
-
large_alphabet is True
|
211 |
-
and abs(character_rank_projection - character_rank_in_language)
|
212 |
-
< target_language_characters_count / 3
|
213 |
-
):
|
214 |
-
character_approved_count += 1
|
215 |
-
continue
|
216 |
-
|
217 |
-
characters_before_source: list[str] = FREQUENCIES[language][
|
218 |
-
0:character_rank_in_language
|
219 |
-
]
|
220 |
-
characters_after_source: list[str] = FREQUENCIES[language][
|
221 |
-
character_rank_in_language:
|
222 |
-
]
|
223 |
-
characters_before: list[str] = ordered_characters[0:character_rank]
|
224 |
-
characters_after: list[str] = ordered_characters[character_rank:]
|
225 |
-
|
226 |
-
before_match_count: int = len(
|
227 |
-
set(characters_before) & set(characters_before_source)
|
228 |
-
)
|
229 |
-
|
230 |
-
after_match_count: int = len(
|
231 |
-
set(characters_after) & set(characters_after_source)
|
232 |
-
)
|
233 |
-
|
234 |
-
if len(characters_before_source) == 0 and before_match_count <= 4:
|
235 |
-
character_approved_count += 1
|
236 |
-
continue
|
237 |
-
|
238 |
-
if len(characters_after_source) == 0 and after_match_count <= 4:
|
239 |
-
character_approved_count += 1
|
240 |
-
continue
|
241 |
-
|
242 |
-
if (
|
243 |
-
before_match_count / len(characters_before_source) >= 0.4
|
244 |
-
or after_match_count / len(characters_after_source) >= 0.4
|
245 |
-
):
|
246 |
-
character_approved_count += 1
|
247 |
-
continue
|
248 |
-
|
249 |
-
return character_approved_count / len(ordered_characters)
|
250 |
-
|
251 |
-
|
252 |
-
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
|
253 |
-
"""
|
254 |
-
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
|
255 |
-
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
256 |
-
One containing the latin letters and the other hebrew.
|
257 |
-
"""
|
258 |
-
layers: dict[str, str] = {}
|
259 |
-
|
260 |
-
for character in decoded_sequence:
|
261 |
-
if character.isalpha() is False:
|
262 |
-
continue
|
263 |
-
|
264 |
-
character_range: str | None = unicode_range(character)
|
265 |
-
|
266 |
-
if character_range is None:
|
267 |
-
continue
|
268 |
-
|
269 |
-
layer_target_range: str | None = None
|
270 |
-
|
271 |
-
for discovered_range in layers:
|
272 |
-
if (
|
273 |
-
is_suspiciously_successive_range(discovered_range, character_range)
|
274 |
-
is False
|
275 |
-
):
|
276 |
-
layer_target_range = discovered_range
|
277 |
-
break
|
278 |
-
|
279 |
-
if layer_target_range is None:
|
280 |
-
layer_target_range = character_range
|
281 |
-
|
282 |
-
if layer_target_range not in layers:
|
283 |
-
layers[layer_target_range] = character.lower()
|
284 |
-
continue
|
285 |
-
|
286 |
-
layers[layer_target_range] += character.lower()
|
287 |
-
|
288 |
-
return list(layers.values())
|
289 |
-
|
290 |
-
|
291 |
-
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
|
292 |
-
"""
|
293 |
-
This function merge results previously given by the function coherence_ratio.
|
294 |
-
The return type is the same as coherence_ratio.
|
295 |
-
"""
|
296 |
-
per_language_ratios: dict[str, list[float]] = {}
|
297 |
-
for result in results:
|
298 |
-
for sub_result in result:
|
299 |
-
language, ratio = sub_result
|
300 |
-
if language not in per_language_ratios:
|
301 |
-
per_language_ratios[language] = [ratio]
|
302 |
-
continue
|
303 |
-
per_language_ratios[language].append(ratio)
|
304 |
-
|
305 |
-
merge = [
|
306 |
-
(
|
307 |
-
language,
|
308 |
-
round(
|
309 |
-
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
310 |
-
4,
|
311 |
-
),
|
312 |
-
)
|
313 |
-
for language in per_language_ratios
|
314 |
-
]
|
315 |
-
|
316 |
-
return sorted(merge, key=lambda x: x[1], reverse=True)
|
317 |
-
|
318 |
-
|
319 |
-
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
|
320 |
-
"""
|
321 |
-
We shall NOT return "English—" in CoherenceMatches because it is an alternative
|
322 |
-
of "English". This function only keeps the best match and remove the em-dash in it.
|
323 |
-
"""
|
324 |
-
index_results: dict[str, list[float]] = dict()
|
325 |
-
|
326 |
-
for result in results:
|
327 |
-
language, ratio = result
|
328 |
-
no_em_name: str = language.replace("—", "")
|
329 |
-
|
330 |
-
if no_em_name not in index_results:
|
331 |
-
index_results[no_em_name] = []
|
332 |
-
|
333 |
-
index_results[no_em_name].append(ratio)
|
334 |
-
|
335 |
-
if any(len(index_results[e]) > 1 for e in index_results):
|
336 |
-
filtered_results: CoherenceMatches = []
|
337 |
-
|
338 |
-
for language in index_results:
|
339 |
-
filtered_results.append((language, max(index_results[language])))
|
340 |
-
|
341 |
-
return filtered_results
|
342 |
-
|
343 |
-
return results
|
344 |
-
|
345 |
-
|
346 |
-
@lru_cache(maxsize=2048)
|
347 |
-
def coherence_ratio(
|
348 |
-
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
|
349 |
-
) -> CoherenceMatches:
|
350 |
-
"""
|
351 |
-
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
|
352 |
-
A layer = Character extraction by alphabets/ranges.
|
353 |
-
"""
|
354 |
-
|
355 |
-
results: list[tuple[str, float]] = []
|
356 |
-
ignore_non_latin: bool = False
|
357 |
-
|
358 |
-
sufficient_match_count: int = 0
|
359 |
-
|
360 |
-
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
361 |
-
if "Latin Based" in lg_inclusion_list:
|
362 |
-
ignore_non_latin = True
|
363 |
-
lg_inclusion_list.remove("Latin Based")
|
364 |
-
|
365 |
-
for layer in alpha_unicode_split(decoded_sequence):
|
366 |
-
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
367 |
-
most_common = sequence_frequencies.most_common()
|
368 |
-
|
369 |
-
character_count: int = sum(o for c, o in most_common)
|
370 |
-
|
371 |
-
if character_count <= TOO_SMALL_SEQUENCE:
|
372 |
-
continue
|
373 |
-
|
374 |
-
popular_character_ordered: list[str] = [c for c, o in most_common]
|
375 |
-
|
376 |
-
for language in lg_inclusion_list or alphabet_languages(
|
377 |
-
popular_character_ordered, ignore_non_latin
|
378 |
-
):
|
379 |
-
ratio: float = characters_popularity_compare(
|
380 |
-
language, popular_character_ordered
|
381 |
-
)
|
382 |
-
|
383 |
-
if ratio < threshold:
|
384 |
-
continue
|
385 |
-
elif ratio >= 0.8:
|
386 |
-
sufficient_match_count += 1
|
387 |
-
|
388 |
-
results.append((language, round(ratio, 4)))
|
389 |
-
|
390 |
-
if sufficient_match_count >= 3:
|
391 |
-
break
|
392 |
-
|
393 |
-
return sorted(
|
394 |
-
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
|
395 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/cli/__init__.py
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
from .__main__ import cli_detect, query_yes_no
|
4 |
-
|
5 |
-
__all__ = (
|
6 |
-
"cli_detect",
|
7 |
-
"query_yes_no",
|
8 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/cli/__main__.py
DELETED
@@ -1,321 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
import argparse
|
4 |
-
import sys
|
5 |
-
from json import dumps
|
6 |
-
from os.path import abspath, basename, dirname, join, realpath
|
7 |
-
from platform import python_version
|
8 |
-
from unicodedata import unidata_version
|
9 |
-
|
10 |
-
import charset_normalizer.md as md_module
|
11 |
-
from charset_normalizer import from_fp
|
12 |
-
from charset_normalizer.models import CliDetectionResult
|
13 |
-
from charset_normalizer.version import __version__
|
14 |
-
|
15 |
-
|
16 |
-
def query_yes_no(question: str, default: str = "yes") -> bool:
|
17 |
-
"""Ask a yes/no question via input() and return their answer.
|
18 |
-
|
19 |
-
"question" is a string that is presented to the user.
|
20 |
-
"default" is the presumed answer if the user just hits <Enter>.
|
21 |
-
It must be "yes" (the default), "no" or None (meaning
|
22 |
-
an answer is required of the user).
|
23 |
-
|
24 |
-
The "answer" return value is True for "yes" or False for "no".
|
25 |
-
|
26 |
-
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
|
27 |
-
"""
|
28 |
-
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
|
29 |
-
if default is None:
|
30 |
-
prompt = " [y/n] "
|
31 |
-
elif default == "yes":
|
32 |
-
prompt = " [Y/n] "
|
33 |
-
elif default == "no":
|
34 |
-
prompt = " [y/N] "
|
35 |
-
else:
|
36 |
-
raise ValueError("invalid default answer: '%s'" % default)
|
37 |
-
|
38 |
-
while True:
|
39 |
-
sys.stdout.write(question + prompt)
|
40 |
-
choice = input().lower()
|
41 |
-
if default is not None and choice == "":
|
42 |
-
return valid[default]
|
43 |
-
elif choice in valid:
|
44 |
-
return valid[choice]
|
45 |
-
else:
|
46 |
-
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
47 |
-
|
48 |
-
|
49 |
-
def cli_detect(argv: list[str] | None = None) -> int:
|
50 |
-
"""
|
51 |
-
CLI assistant using ARGV and ArgumentParser
|
52 |
-
:param argv:
|
53 |
-
:return: 0 if everything is fine, anything else equal trouble
|
54 |
-
"""
|
55 |
-
parser = argparse.ArgumentParser(
|
56 |
-
description="The Real First Universal Charset Detector. "
|
57 |
-
"Discover originating encoding used on text file. "
|
58 |
-
"Normalize text to unicode."
|
59 |
-
)
|
60 |
-
|
61 |
-
parser.add_argument(
|
62 |
-
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
|
63 |
-
)
|
64 |
-
parser.add_argument(
|
65 |
-
"-v",
|
66 |
-
"--verbose",
|
67 |
-
action="store_true",
|
68 |
-
default=False,
|
69 |
-
dest="verbose",
|
70 |
-
help="Display complementary information about file if any. "
|
71 |
-
"Stdout will contain logs about the detection process.",
|
72 |
-
)
|
73 |
-
parser.add_argument(
|
74 |
-
"-a",
|
75 |
-
"--with-alternative",
|
76 |
-
action="store_true",
|
77 |
-
default=False,
|
78 |
-
dest="alternatives",
|
79 |
-
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
80 |
-
)
|
81 |
-
parser.add_argument(
|
82 |
-
"-n",
|
83 |
-
"--normalize",
|
84 |
-
action="store_true",
|
85 |
-
default=False,
|
86 |
-
dest="normalize",
|
87 |
-
help="Permit to normalize input file. If not set, program does not write anything.",
|
88 |
-
)
|
89 |
-
parser.add_argument(
|
90 |
-
"-m",
|
91 |
-
"--minimal",
|
92 |
-
action="store_true",
|
93 |
-
default=False,
|
94 |
-
dest="minimal",
|
95 |
-
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
96 |
-
)
|
97 |
-
parser.add_argument(
|
98 |
-
"-r",
|
99 |
-
"--replace",
|
100 |
-
action="store_true",
|
101 |
-
default=False,
|
102 |
-
dest="replace",
|
103 |
-
help="Replace file when trying to normalize it instead of creating a new one.",
|
104 |
-
)
|
105 |
-
parser.add_argument(
|
106 |
-
"-f",
|
107 |
-
"--force",
|
108 |
-
action="store_true",
|
109 |
-
default=False,
|
110 |
-
dest="force",
|
111 |
-
help="Replace file without asking if you are sure, use this flag with caution.",
|
112 |
-
)
|
113 |
-
parser.add_argument(
|
114 |
-
"-i",
|
115 |
-
"--no-preemptive",
|
116 |
-
action="store_true",
|
117 |
-
default=False,
|
118 |
-
dest="no_preemptive",
|
119 |
-
help="Disable looking at a charset declaration to hint the detector.",
|
120 |
-
)
|
121 |
-
parser.add_argument(
|
122 |
-
"-t",
|
123 |
-
"--threshold",
|
124 |
-
action="store",
|
125 |
-
default=0.2,
|
126 |
-
type=float,
|
127 |
-
dest="threshold",
|
128 |
-
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
|
129 |
-
)
|
130 |
-
parser.add_argument(
|
131 |
-
"--version",
|
132 |
-
action="version",
|
133 |
-
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
|
134 |
-
__version__,
|
135 |
-
python_version(),
|
136 |
-
unidata_version,
|
137 |
-
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
|
138 |
-
),
|
139 |
-
help="Show version information and exit.",
|
140 |
-
)
|
141 |
-
|
142 |
-
args = parser.parse_args(argv)
|
143 |
-
|
144 |
-
if args.replace is True and args.normalize is False:
|
145 |
-
if args.files:
|
146 |
-
for my_file in args.files:
|
147 |
-
my_file.close()
|
148 |
-
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
149 |
-
return 1
|
150 |
-
|
151 |
-
if args.force is True and args.replace is False:
|
152 |
-
if args.files:
|
153 |
-
for my_file in args.files:
|
154 |
-
my_file.close()
|
155 |
-
print("Use --force in addition of --replace only.", file=sys.stderr)
|
156 |
-
return 1
|
157 |
-
|
158 |
-
if args.threshold < 0.0 or args.threshold > 1.0:
|
159 |
-
if args.files:
|
160 |
-
for my_file in args.files:
|
161 |
-
my_file.close()
|
162 |
-
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
163 |
-
return 1
|
164 |
-
|
165 |
-
x_ = []
|
166 |
-
|
167 |
-
for my_file in args.files:
|
168 |
-
matches = from_fp(
|
169 |
-
my_file,
|
170 |
-
threshold=args.threshold,
|
171 |
-
explain=args.verbose,
|
172 |
-
preemptive_behaviour=args.no_preemptive is False,
|
173 |
-
)
|
174 |
-
|
175 |
-
best_guess = matches.best()
|
176 |
-
|
177 |
-
if best_guess is None:
|
178 |
-
print(
|
179 |
-
'Unable to identify originating encoding for "{}". {}'.format(
|
180 |
-
my_file.name,
|
181 |
-
(
|
182 |
-
"Maybe try increasing maximum amount of chaos."
|
183 |
-
if args.threshold < 1.0
|
184 |
-
else ""
|
185 |
-
),
|
186 |
-
),
|
187 |
-
file=sys.stderr,
|
188 |
-
)
|
189 |
-
x_.append(
|
190 |
-
CliDetectionResult(
|
191 |
-
abspath(my_file.name),
|
192 |
-
None,
|
193 |
-
[],
|
194 |
-
[],
|
195 |
-
"Unknown",
|
196 |
-
[],
|
197 |
-
False,
|
198 |
-
1.0,
|
199 |
-
0.0,
|
200 |
-
None,
|
201 |
-
True,
|
202 |
-
)
|
203 |
-
)
|
204 |
-
else:
|
205 |
-
x_.append(
|
206 |
-
CliDetectionResult(
|
207 |
-
abspath(my_file.name),
|
208 |
-
best_guess.encoding,
|
209 |
-
best_guess.encoding_aliases,
|
210 |
-
[
|
211 |
-
cp
|
212 |
-
for cp in best_guess.could_be_from_charset
|
213 |
-
if cp != best_guess.encoding
|
214 |
-
],
|
215 |
-
best_guess.language,
|
216 |
-
best_guess.alphabets,
|
217 |
-
best_guess.bom,
|
218 |
-
best_guess.percent_chaos,
|
219 |
-
best_guess.percent_coherence,
|
220 |
-
None,
|
221 |
-
True,
|
222 |
-
)
|
223 |
-
)
|
224 |
-
|
225 |
-
if len(matches) > 1 and args.alternatives:
|
226 |
-
for el in matches:
|
227 |
-
if el != best_guess:
|
228 |
-
x_.append(
|
229 |
-
CliDetectionResult(
|
230 |
-
abspath(my_file.name),
|
231 |
-
el.encoding,
|
232 |
-
el.encoding_aliases,
|
233 |
-
[
|
234 |
-
cp
|
235 |
-
for cp in el.could_be_from_charset
|
236 |
-
if cp != el.encoding
|
237 |
-
],
|
238 |
-
el.language,
|
239 |
-
el.alphabets,
|
240 |
-
el.bom,
|
241 |
-
el.percent_chaos,
|
242 |
-
el.percent_coherence,
|
243 |
-
None,
|
244 |
-
False,
|
245 |
-
)
|
246 |
-
)
|
247 |
-
|
248 |
-
if args.normalize is True:
|
249 |
-
if best_guess.encoding.startswith("utf") is True:
|
250 |
-
print(
|
251 |
-
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
252 |
-
my_file.name
|
253 |
-
),
|
254 |
-
file=sys.stderr,
|
255 |
-
)
|
256 |
-
if my_file.closed is False:
|
257 |
-
my_file.close()
|
258 |
-
continue
|
259 |
-
|
260 |
-
dir_path = dirname(realpath(my_file.name))
|
261 |
-
file_name = basename(realpath(my_file.name))
|
262 |
-
|
263 |
-
o_: list[str] = file_name.split(".")
|
264 |
-
|
265 |
-
if args.replace is False:
|
266 |
-
o_.insert(-1, best_guess.encoding)
|
267 |
-
if my_file.closed is False:
|
268 |
-
my_file.close()
|
269 |
-
elif (
|
270 |
-
args.force is False
|
271 |
-
and query_yes_no(
|
272 |
-
'Are you sure to normalize "{}" by replacing it ?'.format(
|
273 |
-
my_file.name
|
274 |
-
),
|
275 |
-
"no",
|
276 |
-
)
|
277 |
-
is False
|
278 |
-
):
|
279 |
-
if my_file.closed is False:
|
280 |
-
my_file.close()
|
281 |
-
continue
|
282 |
-
|
283 |
-
try:
|
284 |
-
x_[0].unicode_path = join(dir_path, ".".join(o_))
|
285 |
-
|
286 |
-
with open(x_[0].unicode_path, "wb") as fp:
|
287 |
-
fp.write(best_guess.output())
|
288 |
-
except OSError as e:
|
289 |
-
print(str(e), file=sys.stderr)
|
290 |
-
if my_file.closed is False:
|
291 |
-
my_file.close()
|
292 |
-
return 2
|
293 |
-
|
294 |
-
if my_file.closed is False:
|
295 |
-
my_file.close()
|
296 |
-
|
297 |
-
if args.minimal is False:
|
298 |
-
print(
|
299 |
-
dumps(
|
300 |
-
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
301 |
-
ensure_ascii=True,
|
302 |
-
indent=4,
|
303 |
-
)
|
304 |
-
)
|
305 |
-
else:
|
306 |
-
for my_file in args.files:
|
307 |
-
print(
|
308 |
-
", ".join(
|
309 |
-
[
|
310 |
-
el.encoding or "undefined"
|
311 |
-
for el in x_
|
312 |
-
if el.path == abspath(my_file.name)
|
313 |
-
]
|
314 |
-
)
|
315 |
-
)
|
316 |
-
|
317 |
-
return 0
|
318 |
-
|
319 |
-
|
320 |
-
if __name__ == "__main__":
|
321 |
-
cli_detect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/constant.py
DELETED
@@ -1,1998 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
4 |
-
from encodings.aliases import aliases
|
5 |
-
from re import IGNORECASE
|
6 |
-
from re import compile as re_compile
|
7 |
-
|
8 |
-
# Contain for each eligible encoding a list of/item bytes SIG/BOM
|
9 |
-
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
|
10 |
-
"utf_8": BOM_UTF8,
|
11 |
-
"utf_7": [
|
12 |
-
b"\x2b\x2f\x76\x38",
|
13 |
-
b"\x2b\x2f\x76\x39",
|
14 |
-
b"\x2b\x2f\x76\x2b",
|
15 |
-
b"\x2b\x2f\x76\x2f",
|
16 |
-
b"\x2b\x2f\x76\x38\x2d",
|
17 |
-
],
|
18 |
-
"gb18030": b"\x84\x31\x95\x33",
|
19 |
-
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
20 |
-
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
21 |
-
}
|
22 |
-
|
23 |
-
TOO_SMALL_SEQUENCE: int = 32
|
24 |
-
TOO_BIG_SEQUENCE: int = int(10e6)
|
25 |
-
|
26 |
-
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
|
27 |
-
|
28 |
-
# Up-to-date Unicode ucd/15.0.0
|
29 |
-
UNICODE_RANGES_COMBINED: dict[str, range] = {
|
30 |
-
"Control character": range(32),
|
31 |
-
"Basic Latin": range(32, 128),
|
32 |
-
"Latin-1 Supplement": range(128, 256),
|
33 |
-
"Latin Extended-A": range(256, 384),
|
34 |
-
"Latin Extended-B": range(384, 592),
|
35 |
-
"IPA Extensions": range(592, 688),
|
36 |
-
"Spacing Modifier Letters": range(688, 768),
|
37 |
-
"Combining Diacritical Marks": range(768, 880),
|
38 |
-
"Greek and Coptic": range(880, 1024),
|
39 |
-
"Cyrillic": range(1024, 1280),
|
40 |
-
"Cyrillic Supplement": range(1280, 1328),
|
41 |
-
"Armenian": range(1328, 1424),
|
42 |
-
"Hebrew": range(1424, 1536),
|
43 |
-
"Arabic": range(1536, 1792),
|
44 |
-
"Syriac": range(1792, 1872),
|
45 |
-
"Arabic Supplement": range(1872, 1920),
|
46 |
-
"Thaana": range(1920, 1984),
|
47 |
-
"NKo": range(1984, 2048),
|
48 |
-
"Samaritan": range(2048, 2112),
|
49 |
-
"Mandaic": range(2112, 2144),
|
50 |
-
"Syriac Supplement": range(2144, 2160),
|
51 |
-
"Arabic Extended-B": range(2160, 2208),
|
52 |
-
"Arabic Extended-A": range(2208, 2304),
|
53 |
-
"Devanagari": range(2304, 2432),
|
54 |
-
"Bengali": range(2432, 2560),
|
55 |
-
"Gurmukhi": range(2560, 2688),
|
56 |
-
"Gujarati": range(2688, 2816),
|
57 |
-
"Oriya": range(2816, 2944),
|
58 |
-
"Tamil": range(2944, 3072),
|
59 |
-
"Telugu": range(3072, 3200),
|
60 |
-
"Kannada": range(3200, 3328),
|
61 |
-
"Malayalam": range(3328, 3456),
|
62 |
-
"Sinhala": range(3456, 3584),
|
63 |
-
"Thai": range(3584, 3712),
|
64 |
-
"Lao": range(3712, 3840),
|
65 |
-
"Tibetan": range(3840, 4096),
|
66 |
-
"Myanmar": range(4096, 4256),
|
67 |
-
"Georgian": range(4256, 4352),
|
68 |
-
"Hangul Jamo": range(4352, 4608),
|
69 |
-
"Ethiopic": range(4608, 4992),
|
70 |
-
"Ethiopic Supplement": range(4992, 5024),
|
71 |
-
"Cherokee": range(5024, 5120),
|
72 |
-
"Unified Canadian Aboriginal Syllabics": range(5120, 5760),
|
73 |
-
"Ogham": range(5760, 5792),
|
74 |
-
"Runic": range(5792, 5888),
|
75 |
-
"Tagalog": range(5888, 5920),
|
76 |
-
"Hanunoo": range(5920, 5952),
|
77 |
-
"Buhid": range(5952, 5984),
|
78 |
-
"Tagbanwa": range(5984, 6016),
|
79 |
-
"Khmer": range(6016, 6144),
|
80 |
-
"Mongolian": range(6144, 6320),
|
81 |
-
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
|
82 |
-
"Limbu": range(6400, 6480),
|
83 |
-
"Tai Le": range(6480, 6528),
|
84 |
-
"New Tai Lue": range(6528, 6624),
|
85 |
-
"Khmer Symbols": range(6624, 6656),
|
86 |
-
"Buginese": range(6656, 6688),
|
87 |
-
"Tai Tham": range(6688, 6832),
|
88 |
-
"Combining Diacritical Marks Extended": range(6832, 6912),
|
89 |
-
"Balinese": range(6912, 7040),
|
90 |
-
"Sundanese": range(7040, 7104),
|
91 |
-
"Batak": range(7104, 7168),
|
92 |
-
"Lepcha": range(7168, 7248),
|
93 |
-
"Ol Chiki": range(7248, 7296),
|
94 |
-
"Cyrillic Extended-C": range(7296, 7312),
|
95 |
-
"Georgian Extended": range(7312, 7360),
|
96 |
-
"Sundanese Supplement": range(7360, 7376),
|
97 |
-
"Vedic Extensions": range(7376, 7424),
|
98 |
-
"Phonetic Extensions": range(7424, 7552),
|
99 |
-
"Phonetic Extensions Supplement": range(7552, 7616),
|
100 |
-
"Combining Diacritical Marks Supplement": range(7616, 7680),
|
101 |
-
"Latin Extended Additional": range(7680, 7936),
|
102 |
-
"Greek Extended": range(7936, 8192),
|
103 |
-
"General Punctuation": range(8192, 8304),
|
104 |
-
"Superscripts and Subscripts": range(8304, 8352),
|
105 |
-
"Currency Symbols": range(8352, 8400),
|
106 |
-
"Combining Diacritical Marks for Symbols": range(8400, 8448),
|
107 |
-
"Letterlike Symbols": range(8448, 8528),
|
108 |
-
"Number Forms": range(8528, 8592),
|
109 |
-
"Arrows": range(8592, 8704),
|
110 |
-
"Mathematical Operators": range(8704, 8960),
|
111 |
-
"Miscellaneous Technical": range(8960, 9216),
|
112 |
-
"Control Pictures": range(9216, 9280),
|
113 |
-
"Optical Character Recognition": range(9280, 9312),
|
114 |
-
"Enclosed Alphanumerics": range(9312, 9472),
|
115 |
-
"Box Drawing": range(9472, 9600),
|
116 |
-
"Block Elements": range(9600, 9632),
|
117 |
-
"Geometric Shapes": range(9632, 9728),
|
118 |
-
"Miscellaneous Symbols": range(9728, 9984),
|
119 |
-
"Dingbats": range(9984, 10176),
|
120 |
-
"Miscellaneous Mathematical Symbols-A": range(10176, 10224),
|
121 |
-
"Supplemental Arrows-A": range(10224, 10240),
|
122 |
-
"Braille Patterns": range(10240, 10496),
|
123 |
-
"Supplemental Arrows-B": range(10496, 10624),
|
124 |
-
"Miscellaneous Mathematical Symbols-B": range(10624, 10752),
|
125 |
-
"Supplemental Mathematical Operators": range(10752, 11008),
|
126 |
-
"Miscellaneous Symbols and Arrows": range(11008, 11264),
|
127 |
-
"Glagolitic": range(11264, 11360),
|
128 |
-
"Latin Extended-C": range(11360, 11392),
|
129 |
-
"Coptic": range(11392, 11520),
|
130 |
-
"Georgian Supplement": range(11520, 11568),
|
131 |
-
"Tifinagh": range(11568, 11648),
|
132 |
-
"Ethiopic Extended": range(11648, 11744),
|
133 |
-
"Cyrillic Extended-A": range(11744, 11776),
|
134 |
-
"Supplemental Punctuation": range(11776, 11904),
|
135 |
-
"CJK Radicals Supplement": range(11904, 12032),
|
136 |
-
"Kangxi Radicals": range(12032, 12256),
|
137 |
-
"Ideographic Description Characters": range(12272, 12288),
|
138 |
-
"CJK Symbols and Punctuation": range(12288, 12352),
|
139 |
-
"Hiragana": range(12352, 12448),
|
140 |
-
"Katakana": range(12448, 12544),
|
141 |
-
"Bopomofo": range(12544, 12592),
|
142 |
-
"Hangul Compatibility Jamo": range(12592, 12688),
|
143 |
-
"Kanbun": range(12688, 12704),
|
144 |
-
"Bopomofo Extended": range(12704, 12736),
|
145 |
-
"CJK Strokes": range(12736, 12784),
|
146 |
-
"Katakana Phonetic Extensions": range(12784, 12800),
|
147 |
-
"Enclosed CJK Letters and Months": range(12800, 13056),
|
148 |
-
"CJK Compatibility": range(13056, 13312),
|
149 |
-
"CJK Unified Ideographs Extension A": range(13312, 19904),
|
150 |
-
"Yijing Hexagram Symbols": range(19904, 19968),
|
151 |
-
"CJK Unified Ideographs": range(19968, 40960),
|
152 |
-
"Yi Syllables": range(40960, 42128),
|
153 |
-
"Yi Radicals": range(42128, 42192),
|
154 |
-
"Lisu": range(42192, 42240),
|
155 |
-
"Vai": range(42240, 42560),
|
156 |
-
"Cyrillic Extended-B": range(42560, 42656),
|
157 |
-
"Bamum": range(42656, 42752),
|
158 |
-
"Modifier Tone Letters": range(42752, 42784),
|
159 |
-
"Latin Extended-D": range(42784, 43008),
|
160 |
-
"Syloti Nagri": range(43008, 43056),
|
161 |
-
"Common Indic Number Forms": range(43056, 43072),
|
162 |
-
"Phags-pa": range(43072, 43136),
|
163 |
-
"Saurashtra": range(43136, 43232),
|
164 |
-
"Devanagari Extended": range(43232, 43264),
|
165 |
-
"Kayah Li": range(43264, 43312),
|
166 |
-
"Rejang": range(43312, 43360),
|
167 |
-
"Hangul Jamo Extended-A": range(43360, 43392),
|
168 |
-
"Javanese": range(43392, 43488),
|
169 |
-
"Myanmar Extended-B": range(43488, 43520),
|
170 |
-
"Cham": range(43520, 43616),
|
171 |
-
"Myanmar Extended-A": range(43616, 43648),
|
172 |
-
"Tai Viet": range(43648, 43744),
|
173 |
-
"Meetei Mayek Extensions": range(43744, 43776),
|
174 |
-
"Ethiopic Extended-A": range(43776, 43824),
|
175 |
-
"Latin Extended-E": range(43824, 43888),
|
176 |
-
"Cherokee Supplement": range(43888, 43968),
|
177 |
-
"Meetei Mayek": range(43968, 44032),
|
178 |
-
"Hangul Syllables": range(44032, 55216),
|
179 |
-
"Hangul Jamo Extended-B": range(55216, 55296),
|
180 |
-
"High Surrogates": range(55296, 56192),
|
181 |
-
"High Private Use Surrogates": range(56192, 56320),
|
182 |
-
"Low Surrogates": range(56320, 57344),
|
183 |
-
"Private Use Area": range(57344, 63744),
|
184 |
-
"CJK Compatibility Ideographs": range(63744, 64256),
|
185 |
-
"Alphabetic Presentation Forms": range(64256, 64336),
|
186 |
-
"Arabic Presentation Forms-A": range(64336, 65024),
|
187 |
-
"Variation Selectors": range(65024, 65040),
|
188 |
-
"Vertical Forms": range(65040, 65056),
|
189 |
-
"Combining Half Marks": range(65056, 65072),
|
190 |
-
"CJK Compatibility Forms": range(65072, 65104),
|
191 |
-
"Small Form Variants": range(65104, 65136),
|
192 |
-
"Arabic Presentation Forms-B": range(65136, 65280),
|
193 |
-
"Halfwidth and Fullwidth Forms": range(65280, 65520),
|
194 |
-
"Specials": range(65520, 65536),
|
195 |
-
"Linear B Syllabary": range(65536, 65664),
|
196 |
-
"Linear B Ideograms": range(65664, 65792),
|
197 |
-
"Aegean Numbers": range(65792, 65856),
|
198 |
-
"Ancient Greek Numbers": range(65856, 65936),
|
199 |
-
"Ancient Symbols": range(65936, 66000),
|
200 |
-
"Phaistos Disc": range(66000, 66048),
|
201 |
-
"Lycian": range(66176, 66208),
|
202 |
-
"Carian": range(66208, 66272),
|
203 |
-
"Coptic Epact Numbers": range(66272, 66304),
|
204 |
-
"Old Italic": range(66304, 66352),
|
205 |
-
"Gothic": range(66352, 66384),
|
206 |
-
"Old Permic": range(66384, 66432),
|
207 |
-
"Ugaritic": range(66432, 66464),
|
208 |
-
"Old Persian": range(66464, 66528),
|
209 |
-
"Deseret": range(66560, 66640),
|
210 |
-
"Shavian": range(66640, 66688),
|
211 |
-
"Osmanya": range(66688, 66736),
|
212 |
-
"Osage": range(66736, 66816),
|
213 |
-
"Elbasan": range(66816, 66864),
|
214 |
-
"Caucasian Albanian": range(66864, 66928),
|
215 |
-
"Vithkuqi": range(66928, 67008),
|
216 |
-
"Linear A": range(67072, 67456),
|
217 |
-
"Latin Extended-F": range(67456, 67520),
|
218 |
-
"Cypriot Syllabary": range(67584, 67648),
|
219 |
-
"Imperial Aramaic": range(67648, 67680),
|
220 |
-
"Palmyrene": range(67680, 67712),
|
221 |
-
"Nabataean": range(67712, 67760),
|
222 |
-
"Hatran": range(67808, 67840),
|
223 |
-
"Phoenician": range(67840, 67872),
|
224 |
-
"Lydian": range(67872, 67904),
|
225 |
-
"Meroitic Hieroglyphs": range(67968, 68000),
|
226 |
-
"Meroitic Cursive": range(68000, 68096),
|
227 |
-
"Kharoshthi": range(68096, 68192),
|
228 |
-
"Old South Arabian": range(68192, 68224),
|
229 |
-
"Old North Arabian": range(68224, 68256),
|
230 |
-
"Manichaean": range(68288, 68352),
|
231 |
-
"Avestan": range(68352, 68416),
|
232 |
-
"Inscriptional Parthian": range(68416, 68448),
|
233 |
-
"Inscriptional Pahlavi": range(68448, 68480),
|
234 |
-
"Psalter Pahlavi": range(68480, 68528),
|
235 |
-
"Old Turkic": range(68608, 68688),
|
236 |
-
"Old Hungarian": range(68736, 68864),
|
237 |
-
"Hanifi Rohingya": range(68864, 68928),
|
238 |
-
"Rumi Numeral Symbols": range(69216, 69248),
|
239 |
-
"Yezidi": range(69248, 69312),
|
240 |
-
"Arabic Extended-C": range(69312, 69376),
|
241 |
-
"Old Sogdian": range(69376, 69424),
|
242 |
-
"Sogdian": range(69424, 69488),
|
243 |
-
"Old Uyghur": range(69488, 69552),
|
244 |
-
"Chorasmian": range(69552, 69600),
|
245 |
-
"Elymaic": range(69600, 69632),
|
246 |
-
"Brahmi": range(69632, 69760),
|
247 |
-
"Kaithi": range(69760, 69840),
|
248 |
-
"Sora Sompeng": range(69840, 69888),
|
249 |
-
"Chakma": range(69888, 69968),
|
250 |
-
"Mahajani": range(69968, 70016),
|
251 |
-
"Sharada": range(70016, 70112),
|
252 |
-
"Sinhala Archaic Numbers": range(70112, 70144),
|
253 |
-
"Khojki": range(70144, 70224),
|
254 |
-
"Multani": range(70272, 70320),
|
255 |
-
"Khudawadi": range(70320, 70400),
|
256 |
-
"Grantha": range(70400, 70528),
|
257 |
-
"Newa": range(70656, 70784),
|
258 |
-
"Tirhuta": range(70784, 70880),
|
259 |
-
"Siddham": range(71040, 71168),
|
260 |
-
"Modi": range(71168, 71264),
|
261 |
-
"Mongolian Supplement": range(71264, 71296),
|
262 |
-
"Takri": range(71296, 71376),
|
263 |
-
"Ahom": range(71424, 71504),
|
264 |
-
"Dogra": range(71680, 71760),
|
265 |
-
"Warang Citi": range(71840, 71936),
|
266 |
-
"Dives Akuru": range(71936, 72032),
|
267 |
-
"Nandinagari": range(72096, 72192),
|
268 |
-
"Zanabazar Square": range(72192, 72272),
|
269 |
-
"Soyombo": range(72272, 72368),
|
270 |
-
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
|
271 |
-
"Pau Cin Hau": range(72384, 72448),
|
272 |
-
"Devanagari Extended-A": range(72448, 72544),
|
273 |
-
"Bhaiksuki": range(72704, 72816),
|
274 |
-
"Marchen": range(72816, 72896),
|
275 |
-
"Masaram Gondi": range(72960, 73056),
|
276 |
-
"Gunjala Gondi": range(73056, 73136),
|
277 |
-
"Makasar": range(73440, 73472),
|
278 |
-
"Kawi": range(73472, 73568),
|
279 |
-
"Lisu Supplement": range(73648, 73664),
|
280 |
-
"Tamil Supplement": range(73664, 73728),
|
281 |
-
"Cuneiform": range(73728, 74752),
|
282 |
-
"Cuneiform Numbers and Punctuation": range(74752, 74880),
|
283 |
-
"Early Dynastic Cuneiform": range(74880, 75088),
|
284 |
-
"Cypro-Minoan": range(77712, 77824),
|
285 |
-
"Egyptian Hieroglyphs": range(77824, 78896),
|
286 |
-
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
|
287 |
-
"Anatolian Hieroglyphs": range(82944, 83584),
|
288 |
-
"Bamum Supplement": range(92160, 92736),
|
289 |
-
"Mro": range(92736, 92784),
|
290 |
-
"Tangsa": range(92784, 92880),
|
291 |
-
"Bassa Vah": range(92880, 92928),
|
292 |
-
"Pahawh Hmong": range(92928, 93072),
|
293 |
-
"Medefaidrin": range(93760, 93856),
|
294 |
-
"Miao": range(93952, 94112),
|
295 |
-
"Ideographic Symbols and Punctuation": range(94176, 94208),
|
296 |
-
"Tangut": range(94208, 100352),
|
297 |
-
"Tangut Components": range(100352, 101120),
|
298 |
-
"Khitan Small Script": range(101120, 101632),
|
299 |
-
"Tangut Supplement": range(101632, 101760),
|
300 |
-
"Kana Extended-B": range(110576, 110592),
|
301 |
-
"Kana Supplement": range(110592, 110848),
|
302 |
-
"Kana Extended-A": range(110848, 110896),
|
303 |
-
"Small Kana Extension": range(110896, 110960),
|
304 |
-
"Nushu": range(110960, 111360),
|
305 |
-
"Duployan": range(113664, 113824),
|
306 |
-
"Shorthand Format Controls": range(113824, 113840),
|
307 |
-
"Znamenny Musical Notation": range(118528, 118736),
|
308 |
-
"Byzantine Musical Symbols": range(118784, 119040),
|
309 |
-
"Musical Symbols": range(119040, 119296),
|
310 |
-
"Ancient Greek Musical Notation": range(119296, 119376),
|
311 |
-
"Kaktovik Numerals": range(119488, 119520),
|
312 |
-
"Mayan Numerals": range(119520, 119552),
|
313 |
-
"Tai Xuan Jing Symbols": range(119552, 119648),
|
314 |
-
"Counting Rod Numerals": range(119648, 119680),
|
315 |
-
"Mathematical Alphanumeric Symbols": range(119808, 120832),
|
316 |
-
"Sutton SignWriting": range(120832, 121520),
|
317 |
-
"Latin Extended-G": range(122624, 122880),
|
318 |
-
"Glagolitic Supplement": range(122880, 122928),
|
319 |
-
"Cyrillic Extended-D": range(122928, 123024),
|
320 |
-
"Nyiakeng Puachue Hmong": range(123136, 123216),
|
321 |
-
"Toto": range(123536, 123584),
|
322 |
-
"Wancho": range(123584, 123648),
|
323 |
-
"Nag Mundari": range(124112, 124160),
|
324 |
-
"Ethiopic Extended-B": range(124896, 124928),
|
325 |
-
"Mende Kikakui": range(124928, 125152),
|
326 |
-
"Adlam": range(125184, 125280),
|
327 |
-
"Indic Siyaq Numbers": range(126064, 126144),
|
328 |
-
"Ottoman Siyaq Numbers": range(126208, 126288),
|
329 |
-
"Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
|
330 |
-
"Mahjong Tiles": range(126976, 127024),
|
331 |
-
"Domino Tiles": range(127024, 127136),
|
332 |
-
"Playing Cards": range(127136, 127232),
|
333 |
-
"Enclosed Alphanumeric Supplement": range(127232, 127488),
|
334 |
-
"Enclosed Ideographic Supplement": range(127488, 127744),
|
335 |
-
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
|
336 |
-
"Emoticons range(Emoji)": range(128512, 128592),
|
337 |
-
"Ornamental Dingbats": range(128592, 128640),
|
338 |
-
"Transport and Map Symbols": range(128640, 128768),
|
339 |
-
"Alchemical Symbols": range(128768, 128896),
|
340 |
-
"Geometric Shapes Extended": range(128896, 129024),
|
341 |
-
"Supplemental Arrows-C": range(129024, 129280),
|
342 |
-
"Supplemental Symbols and Pictographs": range(129280, 129536),
|
343 |
-
"Chess Symbols": range(129536, 129648),
|
344 |
-
"Symbols and Pictographs Extended-A": range(129648, 129792),
|
345 |
-
"Symbols for Legacy Computing": range(129792, 130048),
|
346 |
-
"CJK Unified Ideographs Extension B": range(131072, 173792),
|
347 |
-
"CJK Unified Ideographs Extension C": range(173824, 177984),
|
348 |
-
"CJK Unified Ideographs Extension D": range(177984, 178208),
|
349 |
-
"CJK Unified Ideographs Extension E": range(178208, 183984),
|
350 |
-
"CJK Unified Ideographs Extension F": range(183984, 191472),
|
351 |
-
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
|
352 |
-
"CJK Unified Ideographs Extension G": range(196608, 201552),
|
353 |
-
"CJK Unified Ideographs Extension H": range(201552, 205744),
|
354 |
-
"Tags": range(917504, 917632),
|
355 |
-
"Variation Selectors Supplement": range(917760, 918000),
|
356 |
-
"Supplementary Private Use Area-A": range(983040, 1048576),
|
357 |
-
"Supplementary Private Use Area-B": range(1048576, 1114112),
|
358 |
-
}
|
359 |
-
|
360 |
-
|
361 |
-
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
|
362 |
-
"Supplement",
|
363 |
-
"Extended",
|
364 |
-
"Extensions",
|
365 |
-
"Modifier",
|
366 |
-
"Marks",
|
367 |
-
"Punctuation",
|
368 |
-
"Symbols",
|
369 |
-
"Forms",
|
370 |
-
"Operators",
|
371 |
-
"Miscellaneous",
|
372 |
-
"Drawing",
|
373 |
-
"Block",
|
374 |
-
"Shapes",
|
375 |
-
"Supplemental",
|
376 |
-
"Tags",
|
377 |
-
]
|
378 |
-
|
379 |
-
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
380 |
-
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
381 |
-
IGNORECASE,
|
382 |
-
)
|
383 |
-
|
384 |
-
IANA_NO_ALIASES = [
|
385 |
-
"cp720",
|
386 |
-
"cp737",
|
387 |
-
"cp856",
|
388 |
-
"cp874",
|
389 |
-
"cp875",
|
390 |
-
"cp1006",
|
391 |
-
"koi8_r",
|
392 |
-
"koi8_t",
|
393 |
-
"koi8_u",
|
394 |
-
]
|
395 |
-
|
396 |
-
IANA_SUPPORTED: list[str] = sorted(
|
397 |
-
filter(
|
398 |
-
lambda x: x.endswith("_codec") is False
|
399 |
-
and x not in {"rot_13", "tactis", "mbcs"},
|
400 |
-
list(set(aliases.values())) + IANA_NO_ALIASES,
|
401 |
-
)
|
402 |
-
)
|
403 |
-
|
404 |
-
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
405 |
-
|
406 |
-
# pre-computed code page that are similar using the function cp_similarity.
|
407 |
-
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
|
408 |
-
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
409 |
-
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
410 |
-
"cp1125": ["cp866"],
|
411 |
-
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
412 |
-
"cp1250": ["iso8859_2"],
|
413 |
-
"cp1251": ["kz1048", "ptcp154"],
|
414 |
-
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
415 |
-
"cp1253": ["iso8859_7"],
|
416 |
-
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
417 |
-
"cp1257": ["iso8859_13"],
|
418 |
-
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
419 |
-
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
420 |
-
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
421 |
-
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
422 |
-
"cp857": ["cp850", "cp858", "cp865"],
|
423 |
-
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
424 |
-
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
425 |
-
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
426 |
-
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
427 |
-
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
428 |
-
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
429 |
-
"cp866": ["cp1125"],
|
430 |
-
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
431 |
-
"iso8859_11": ["tis_620"],
|
432 |
-
"iso8859_13": ["cp1257"],
|
433 |
-
"iso8859_14": [
|
434 |
-
"iso8859_10",
|
435 |
-
"iso8859_15",
|
436 |
-
"iso8859_16",
|
437 |
-
"iso8859_3",
|
438 |
-
"iso8859_9",
|
439 |
-
"latin_1",
|
440 |
-
],
|
441 |
-
"iso8859_15": [
|
442 |
-
"cp1252",
|
443 |
-
"cp1254",
|
444 |
-
"iso8859_10",
|
445 |
-
"iso8859_14",
|
446 |
-
"iso8859_16",
|
447 |
-
"iso8859_3",
|
448 |
-
"iso8859_9",
|
449 |
-
"latin_1",
|
450 |
-
],
|
451 |
-
"iso8859_16": [
|
452 |
-
"iso8859_14",
|
453 |
-
"iso8859_15",
|
454 |
-
"iso8859_2",
|
455 |
-
"iso8859_3",
|
456 |
-
"iso8859_9",
|
457 |
-
"latin_1",
|
458 |
-
],
|
459 |
-
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
460 |
-
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
461 |
-
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
462 |
-
"iso8859_7": ["cp1253"],
|
463 |
-
"iso8859_9": [
|
464 |
-
"cp1252",
|
465 |
-
"cp1254",
|
466 |
-
"cp1258",
|
467 |
-
"iso8859_10",
|
468 |
-
"iso8859_14",
|
469 |
-
"iso8859_15",
|
470 |
-
"iso8859_16",
|
471 |
-
"iso8859_3",
|
472 |
-
"iso8859_4",
|
473 |
-
"latin_1",
|
474 |
-
],
|
475 |
-
"kz1048": ["cp1251", "ptcp154"],
|
476 |
-
"latin_1": [
|
477 |
-
"cp1252",
|
478 |
-
"cp1254",
|
479 |
-
"cp1258",
|
480 |
-
"iso8859_10",
|
481 |
-
"iso8859_14",
|
482 |
-
"iso8859_15",
|
483 |
-
"iso8859_16",
|
484 |
-
"iso8859_3",
|
485 |
-
"iso8859_4",
|
486 |
-
"iso8859_9",
|
487 |
-
],
|
488 |
-
"mac_iceland": ["mac_roman", "mac_turkish"],
|
489 |
-
"mac_roman": ["mac_iceland", "mac_turkish"],
|
490 |
-
"mac_turkish": ["mac_iceland", "mac_roman"],
|
491 |
-
"ptcp154": ["cp1251", "kz1048"],
|
492 |
-
"tis_620": ["iso8859_11"],
|
493 |
-
}
|
494 |
-
|
495 |
-
|
496 |
-
CHARDET_CORRESPONDENCE: dict[str, str] = {
|
497 |
-
"iso2022_kr": "ISO-2022-KR",
|
498 |
-
"iso2022_jp": "ISO-2022-JP",
|
499 |
-
"euc_kr": "EUC-KR",
|
500 |
-
"tis_620": "TIS-620",
|
501 |
-
"utf_32": "UTF-32",
|
502 |
-
"euc_jp": "EUC-JP",
|
503 |
-
"koi8_r": "KOI8-R",
|
504 |
-
"iso8859_1": "ISO-8859-1",
|
505 |
-
"iso8859_2": "ISO-8859-2",
|
506 |
-
"iso8859_5": "ISO-8859-5",
|
507 |
-
"iso8859_6": "ISO-8859-6",
|
508 |
-
"iso8859_7": "ISO-8859-7",
|
509 |
-
"iso8859_8": "ISO-8859-8",
|
510 |
-
"utf_16": "UTF-16",
|
511 |
-
"cp855": "IBM855",
|
512 |
-
"mac_cyrillic": "MacCyrillic",
|
513 |
-
"gb2312": "GB2312",
|
514 |
-
"gb18030": "GB18030",
|
515 |
-
"cp932": "CP932",
|
516 |
-
"cp866": "IBM866",
|
517 |
-
"utf_8": "utf-8",
|
518 |
-
"utf_8_sig": "UTF-8-SIG",
|
519 |
-
"shift_jis": "SHIFT_JIS",
|
520 |
-
"big5": "Big5",
|
521 |
-
"cp1250": "windows-1250",
|
522 |
-
"cp1251": "windows-1251",
|
523 |
-
"cp1252": "Windows-1252",
|
524 |
-
"cp1253": "windows-1253",
|
525 |
-
"cp1255": "windows-1255",
|
526 |
-
"cp1256": "windows-1256",
|
527 |
-
"cp1254": "Windows-1254",
|
528 |
-
"cp949": "CP949",
|
529 |
-
}
|
530 |
-
|
531 |
-
|
532 |
-
COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
|
533 |
-
"<",
|
534 |
-
">",
|
535 |
-
"=",
|
536 |
-
":",
|
537 |
-
"/",
|
538 |
-
"&",
|
539 |
-
";",
|
540 |
-
"{",
|
541 |
-
"}",
|
542 |
-
"[",
|
543 |
-
"]",
|
544 |
-
",",
|
545 |
-
"|",
|
546 |
-
'"',
|
547 |
-
"-",
|
548 |
-
"(",
|
549 |
-
")",
|
550 |
-
}
|
551 |
-
|
552 |
-
|
553 |
-
KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
|
554 |
-
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
555 |
-
|
556 |
-
# Logging LEVEL below DEBUG
|
557 |
-
TRACE: int = 5
|
558 |
-
|
559 |
-
|
560 |
-
# Language label that contain the em dash "—"
|
561 |
-
# character are to be considered alternative seq to origin
|
562 |
-
FREQUENCIES: dict[str, list[str]] = {
|
563 |
-
"English": [
|
564 |
-
"e",
|
565 |
-
"a",
|
566 |
-
"t",
|
567 |
-
"i",
|
568 |
-
"o",
|
569 |
-
"n",
|
570 |
-
"s",
|
571 |
-
"r",
|
572 |
-
"h",
|
573 |
-
"l",
|
574 |
-
"d",
|
575 |
-
"c",
|
576 |
-
"u",
|
577 |
-
"m",
|
578 |
-
"f",
|
579 |
-
"p",
|
580 |
-
"g",
|
581 |
-
"w",
|
582 |
-
"y",
|
583 |
-
"b",
|
584 |
-
"v",
|
585 |
-
"k",
|
586 |
-
"x",
|
587 |
-
"j",
|
588 |
-
"z",
|
589 |
-
"q",
|
590 |
-
],
|
591 |
-
"English—": [
|
592 |
-
"e",
|
593 |
-
"a",
|
594 |
-
"t",
|
595 |
-
"i",
|
596 |
-
"o",
|
597 |
-
"n",
|
598 |
-
"s",
|
599 |
-
"r",
|
600 |
-
"h",
|
601 |
-
"l",
|
602 |
-
"d",
|
603 |
-
"c",
|
604 |
-
"m",
|
605 |
-
"u",
|
606 |
-
"f",
|
607 |
-
"p",
|
608 |
-
"g",
|
609 |
-
"w",
|
610 |
-
"b",
|
611 |
-
"y",
|
612 |
-
"v",
|
613 |
-
"k",
|
614 |
-
"j",
|
615 |
-
"x",
|
616 |
-
"z",
|
617 |
-
"q",
|
618 |
-
],
|
619 |
-
"German": [
|
620 |
-
"e",
|
621 |
-
"n",
|
622 |
-
"i",
|
623 |
-
"r",
|
624 |
-
"s",
|
625 |
-
"t",
|
626 |
-
"a",
|
627 |
-
"d",
|
628 |
-
"h",
|
629 |
-
"u",
|
630 |
-
"l",
|
631 |
-
"g",
|
632 |
-
"o",
|
633 |
-
"c",
|
634 |
-
"m",
|
635 |
-
"b",
|
636 |
-
"f",
|
637 |
-
"k",
|
638 |
-
"w",
|
639 |
-
"z",
|
640 |
-
"p",
|
641 |
-
"v",
|
642 |
-
"ü",
|
643 |
-
"ä",
|
644 |
-
"ö",
|
645 |
-
"j",
|
646 |
-
],
|
647 |
-
"French": [
|
648 |
-
"e",
|
649 |
-
"a",
|
650 |
-
"s",
|
651 |
-
"n",
|
652 |
-
"i",
|
653 |
-
"t",
|
654 |
-
"r",
|
655 |
-
"l",
|
656 |
-
"u",
|
657 |
-
"o",
|
658 |
-
"d",
|
659 |
-
"c",
|
660 |
-
"p",
|
661 |
-
"m",
|
662 |
-
"é",
|
663 |
-
"v",
|
664 |
-
"g",
|
665 |
-
"f",
|
666 |
-
"b",
|
667 |
-
"h",
|
668 |
-
"q",
|
669 |
-
"à",
|
670 |
-
"x",
|
671 |
-
"è",
|
672 |
-
"y",
|
673 |
-
"j",
|
674 |
-
],
|
675 |
-
"Dutch": [
|
676 |
-
"e",
|
677 |
-
"n",
|
678 |
-
"a",
|
679 |
-
"i",
|
680 |
-
"r",
|
681 |
-
"t",
|
682 |
-
"o",
|
683 |
-
"d",
|
684 |
-
"s",
|
685 |
-
"l",
|
686 |
-
"g",
|
687 |
-
"h",
|
688 |
-
"v",
|
689 |
-
"m",
|
690 |
-
"u",
|
691 |
-
"k",
|
692 |
-
"c",
|
693 |
-
"p",
|
694 |
-
"b",
|
695 |
-
"w",
|
696 |
-
"j",
|
697 |
-
"z",
|
698 |
-
"f",
|
699 |
-
"y",
|
700 |
-
"x",
|
701 |
-
"ë",
|
702 |
-
],
|
703 |
-
"Italian": [
|
704 |
-
"e",
|
705 |
-
"i",
|
706 |
-
"a",
|
707 |
-
"o",
|
708 |
-
"n",
|
709 |
-
"l",
|
710 |
-
"t",
|
711 |
-
"r",
|
712 |
-
"s",
|
713 |
-
"c",
|
714 |
-
"d",
|
715 |
-
"u",
|
716 |
-
"p",
|
717 |
-
"m",
|
718 |
-
"g",
|
719 |
-
"v",
|
720 |
-
"f",
|
721 |
-
"b",
|
722 |
-
"z",
|
723 |
-
"h",
|
724 |
-
"q",
|
725 |
-
"è",
|
726 |
-
"à",
|
727 |
-
"k",
|
728 |
-
"y",
|
729 |
-
"ò",
|
730 |
-
],
|
731 |
-
"Polish": [
|
732 |
-
"a",
|
733 |
-
"i",
|
734 |
-
"o",
|
735 |
-
"e",
|
736 |
-
"n",
|
737 |
-
"r",
|
738 |
-
"z",
|
739 |
-
"w",
|
740 |
-
"s",
|
741 |
-
"c",
|
742 |
-
"t",
|
743 |
-
"k",
|
744 |
-
"y",
|
745 |
-
"d",
|
746 |
-
"p",
|
747 |
-
"m",
|
748 |
-
"u",
|
749 |
-
"l",
|
750 |
-
"j",
|
751 |
-
"ł",
|
752 |
-
"g",
|
753 |
-
"b",
|
754 |
-
"h",
|
755 |
-
"ą",
|
756 |
-
"ę",
|
757 |
-
"ó",
|
758 |
-
],
|
759 |
-
"Spanish": [
|
760 |
-
"e",
|
761 |
-
"a",
|
762 |
-
"o",
|
763 |
-
"n",
|
764 |
-
"s",
|
765 |
-
"r",
|
766 |
-
"i",
|
767 |
-
"l",
|
768 |
-
"d",
|
769 |
-
"t",
|
770 |
-
"c",
|
771 |
-
"u",
|
772 |
-
"m",
|
773 |
-
"p",
|
774 |
-
"b",
|
775 |
-
"g",
|
776 |
-
"v",
|
777 |
-
"f",
|
778 |
-
"y",
|
779 |
-
"ó",
|
780 |
-
"h",
|
781 |
-
"q",
|
782 |
-
"í",
|
783 |
-
"j",
|
784 |
-
"z",
|
785 |
-
"á",
|
786 |
-
],
|
787 |
-
"Russian": [
|
788 |
-
"о",
|
789 |
-
"а",
|
790 |
-
"е",
|
791 |
-
"и",
|
792 |
-
"н",
|
793 |
-
"с",
|
794 |
-
"т",
|
795 |
-
"р",
|
796 |
-
"в",
|
797 |
-
"л",
|
798 |
-
"к",
|
799 |
-
"м",
|
800 |
-
"д",
|
801 |
-
"п",
|
802 |
-
"у",
|
803 |
-
"г",
|
804 |
-
"я",
|
805 |
-
"ы",
|
806 |
-
"з",
|
807 |
-
"б",
|
808 |
-
"й",
|
809 |
-
"ь",
|
810 |
-
"ч",
|
811 |
-
"х",
|
812 |
-
"ж",
|
813 |
-
"ц",
|
814 |
-
],
|
815 |
-
# Jap-Kanji
|
816 |
-
"Japanese": [
|
817 |
-
"人",
|
818 |
-
"一",
|
819 |
-
"大",
|
820 |
-
"亅",
|
821 |
-
"丁",
|
822 |
-
"丨",
|
823 |
-
"竹",
|
824 |
-
"笑",
|
825 |
-
"口",
|
826 |
-
"日",
|
827 |
-
"今",
|
828 |
-
"二",
|
829 |
-
"彳",
|
830 |
-
"行",
|
831 |
-
"十",
|
832 |
-
"土",
|
833 |
-
"丶",
|
834 |
-
"寸",
|
835 |
-
"寺",
|
836 |
-
"時",
|
837 |
-
"乙",
|
838 |
-
"丿",
|
839 |
-
"乂",
|
840 |
-
"气",
|
841 |
-
"気",
|
842 |
-
"冂",
|
843 |
-
"巾",
|
844 |
-
"亠",
|
845 |
-
"市",
|
846 |
-
"目",
|
847 |
-
"儿",
|
848 |
-
"見",
|
849 |
-
"八",
|
850 |
-
"小",
|
851 |
-
"凵",
|
852 |
-
"県",
|
853 |
-
"月",
|
854 |
-
"彐",
|
855 |
-
"門",
|
856 |
-
"間",
|
857 |
-
"木",
|
858 |
-
"東",
|
859 |
-
"山",
|
860 |
-
"出",
|
861 |
-
"本",
|
862 |
-
"中",
|
863 |
-
"刀",
|
864 |
-
"分",
|
865 |
-
"耳",
|
866 |
-
"又",
|
867 |
-
"取",
|
868 |
-
"最",
|
869 |
-
"言",
|
870 |
-
"田",
|
871 |
-
"心",
|
872 |
-
"思",
|
873 |
-
"刂",
|
874 |
-
"前",
|
875 |
-
"京",
|
876 |
-
"尹",
|
877 |
-
"事",
|
878 |
-
"生",
|
879 |
-
"厶",
|
880 |
-
"云",
|
881 |
-
"会",
|
882 |
-
"未",
|
883 |
-
"来",
|
884 |
-
"白",
|
885 |
-
"冫",
|
886 |
-
"楽",
|
887 |
-
"灬",
|
888 |
-
"馬",
|
889 |
-
"尸",
|
890 |
-
"尺",
|
891 |
-
"駅",
|
892 |
-
"明",
|
893 |
-
"耂",
|
894 |
-
"者",
|
895 |
-
"了",
|
896 |
-
"阝",
|
897 |
-
"都",
|
898 |
-
"高",
|
899 |
-
"卜",
|
900 |
-
"占",
|
901 |
-
"厂",
|
902 |
-
"广",
|
903 |
-
"店",
|
904 |
-
"子",
|
905 |
-
"申",
|
906 |
-
"奄",
|
907 |
-
"亻",
|
908 |
-
"俺",
|
909 |
-
"上",
|
910 |
-
"方",
|
911 |
-
"冖",
|
912 |
-
"学",
|
913 |
-
"衣",
|
914 |
-
"艮",
|
915 |
-
"食",
|
916 |
-
"自",
|
917 |
-
],
|
918 |
-
# Jap-Katakana
|
919 |
-
"Japanese—": [
|
920 |
-
"ー",
|
921 |
-
"ン",
|
922 |
-
"ス",
|
923 |
-
"・",
|
924 |
-
"ル",
|
925 |
-
"ト",
|
926 |
-
"リ",
|
927 |
-
"イ",
|
928 |
-
"ア",
|
929 |
-
"ラ",
|
930 |
-
"ッ",
|
931 |
-
"ク",
|
932 |
-
"ド",
|
933 |
-
"シ",
|
934 |
-
"レ",
|
935 |
-
"ジ",
|
936 |
-
"タ",
|
937 |
-
"フ",
|
938 |
-
"ロ",
|
939 |
-
"カ",
|
940 |
-
"テ",
|
941 |
-
"マ",
|
942 |
-
"ィ",
|
943 |
-
"グ",
|
944 |
-
"バ",
|
945 |
-
"ム",
|
946 |
-
"プ",
|
947 |
-
"オ",
|
948 |
-
"コ",
|
949 |
-
"デ",
|
950 |
-
"ニ",
|
951 |
-
"ウ",
|
952 |
-
"メ",
|
953 |
-
"サ",
|
954 |
-
"ビ",
|
955 |
-
"ナ",
|
956 |
-
"ブ",
|
957 |
-
"ャ",
|
958 |
-
"エ",
|
959 |
-
"ュ",
|
960 |
-
"チ",
|
961 |
-
"キ",
|
962 |
-
"ズ",
|
963 |
-
"ダ",
|
964 |
-
"パ",
|
965 |
-
"ミ",
|
966 |
-
"ェ",
|
967 |
-
"ョ",
|
968 |
-
"ハ",
|
969 |
-
"セ",
|
970 |
-
"ベ",
|
971 |
-
"ガ",
|
972 |
-
"モ",
|
973 |
-
"ツ",
|
974 |
-
"ネ",
|
975 |
-
"ボ",
|
976 |
-
"ソ",
|
977 |
-
"ノ",
|
978 |
-
"ァ",
|
979 |
-
"ヴ",
|
980 |
-
"ワ",
|
981 |
-
"ポ",
|
982 |
-
"ペ",
|
983 |
-
"ピ",
|
984 |
-
"ケ",
|
985 |
-
"ゴ",
|
986 |
-
"ギ",
|
987 |
-
"ザ",
|
988 |
-
"ホ",
|
989 |
-
"ゲ",
|
990 |
-
"ォ",
|
991 |
-
"ヤ",
|
992 |
-
"ヒ",
|
993 |
-
"ユ",
|
994 |
-
"ヨ",
|
995 |
-
"ヘ",
|
996 |
-
"ゼ",
|
997 |
-
"ヌ",
|
998 |
-
"ゥ",
|
999 |
-
"ゾ",
|
1000 |
-
"ヶ",
|
1001 |
-
"ヂ",
|
1002 |
-
"ヲ",
|
1003 |
-
"ヅ",
|
1004 |
-
"ヵ",
|
1005 |
-
"ヱ",
|
1006 |
-
"ヰ",
|
1007 |
-
"ヮ",
|
1008 |
-
"ヽ",
|
1009 |
-
"゠",
|
1010 |
-
"ヾ",
|
1011 |
-
"ヷ",
|
1012 |
-
"ヿ",
|
1013 |
-
"ヸ",
|
1014 |
-
"ヹ",
|
1015 |
-
"ヺ",
|
1016 |
-
],
|
1017 |
-
# Jap-Hiragana
|
1018 |
-
"Japanese——": [
|
1019 |
-
"の",
|
1020 |
-
"に",
|
1021 |
-
"る",
|
1022 |
-
"た",
|
1023 |
-
"と",
|
1024 |
-
"は",
|
1025 |
-
"し",
|
1026 |
-
"い",
|
1027 |
-
"を",
|
1028 |
-
"で",
|
1029 |
-
"て",
|
1030 |
-
"が",
|
1031 |
-
"な",
|
1032 |
-
"れ",
|
1033 |
-
"か",
|
1034 |
-
"ら",
|
1035 |
-
"さ",
|
1036 |
-
"っ",
|
1037 |
-
"り",
|
1038 |
-
"す",
|
1039 |
-
"あ",
|
1040 |
-
"も",
|
1041 |
-
"こ",
|
1042 |
-
"ま",
|
1043 |
-
"う",
|
1044 |
-
"く",
|
1045 |
-
"よ",
|
1046 |
-
"き",
|
1047 |
-
"ん",
|
1048 |
-
"め",
|
1049 |
-
"お",
|
1050 |
-
"け",
|
1051 |
-
"そ",
|
1052 |
-
"つ",
|
1053 |
-
"だ",
|
1054 |
-
"や",
|
1055 |
-
"え",
|
1056 |
-
"ど",
|
1057 |
-
"わ",
|
1058 |
-
"ち",
|
1059 |
-
"み",
|
1060 |
-
"せ",
|
1061 |
-
"じ",
|
1062 |
-
"ば",
|
1063 |
-
"へ",
|
1064 |
-
"び",
|
1065 |
-
"ず",
|
1066 |
-
"ろ",
|
1067 |
-
"ほ",
|
1068 |
-
"げ",
|
1069 |
-
"む",
|
1070 |
-
"べ",
|
1071 |
-
"ひ",
|
1072 |
-
"ょ",
|
1073 |
-
"ゆ",
|
1074 |
-
"ぶ",
|
1075 |
-
"ご",
|
1076 |
-
"ゃ",
|
1077 |
-
"ね",
|
1078 |
-
"ふ",
|
1079 |
-
"ぐ",
|
1080 |
-
"ぎ",
|
1081 |
-
"ぼ",
|
1082 |
-
"ゅ",
|
1083 |
-
"づ",
|
1084 |
-
"ざ",
|
1085 |
-
"ぞ",
|
1086 |
-
"ぬ",
|
1087 |
-
"ぜ",
|
1088 |
-
"ぱ",
|
1089 |
-
"ぽ",
|
1090 |
-
"ぷ",
|
1091 |
-
"ぴ",
|
1092 |
-
"ぃ",
|
1093 |
-
"ぁ",
|
1094 |
-
"ぇ",
|
1095 |
-
"ぺ",
|
1096 |
-
"ゞ",
|
1097 |
-
"ぢ",
|
1098 |
-
"ぉ",
|
1099 |
-
"ぅ",
|
1100 |
-
"ゐ",
|
1101 |
-
"ゝ",
|
1102 |
-
"ゑ",
|
1103 |
-
"゛",
|
1104 |
-
"゜",
|
1105 |
-
"ゎ",
|
1106 |
-
"ゔ",
|
1107 |
-
"゚",
|
1108 |
-
"ゟ",
|
1109 |
-
"゙",
|
1110 |
-
"ゕ",
|
1111 |
-
"ゖ",
|
1112 |
-
],
|
1113 |
-
"Portuguese": [
|
1114 |
-
"a",
|
1115 |
-
"e",
|
1116 |
-
"o",
|
1117 |
-
"s",
|
1118 |
-
"i",
|
1119 |
-
"r",
|
1120 |
-
"d",
|
1121 |
-
"n",
|
1122 |
-
"t",
|
1123 |
-
"m",
|
1124 |
-
"u",
|
1125 |
-
"c",
|
1126 |
-
"l",
|
1127 |
-
"p",
|
1128 |
-
"g",
|
1129 |
-
"v",
|
1130 |
-
"b",
|
1131 |
-
"f",
|
1132 |
-
"h",
|
1133 |
-
"ã",
|
1134 |
-
"q",
|
1135 |
-
"é",
|
1136 |
-
"ç",
|
1137 |
-
"á",
|
1138 |
-
"z",
|
1139 |
-
"í",
|
1140 |
-
],
|
1141 |
-
"Swedish": [
|
1142 |
-
"e",
|
1143 |
-
"a",
|
1144 |
-
"n",
|
1145 |
-
"r",
|
1146 |
-
"t",
|
1147 |
-
"s",
|
1148 |
-
"i",
|
1149 |
-
"l",
|
1150 |
-
"d",
|
1151 |
-
"o",
|
1152 |
-
"m",
|
1153 |
-
"k",
|
1154 |
-
"g",
|
1155 |
-
"v",
|
1156 |
-
"h",
|
1157 |
-
"f",
|
1158 |
-
"u",
|
1159 |
-
"p",
|
1160 |
-
"ä",
|
1161 |
-
"c",
|
1162 |
-
"b",
|
1163 |
-
"ö",
|
1164 |
-
"å",
|
1165 |
-
"y",
|
1166 |
-
"j",
|
1167 |
-
"x",
|
1168 |
-
],
|
1169 |
-
"Chinese": [
|
1170 |
-
"的",
|
1171 |
-
"一",
|
1172 |
-
"是",
|
1173 |
-
"不",
|
1174 |
-
"了",
|
1175 |
-
"在",
|
1176 |
-
"人",
|
1177 |
-
"有",
|
1178 |
-
"我",
|
1179 |
-
"他",
|
1180 |
-
"这",
|
1181 |
-
"个",
|
1182 |
-
"们",
|
1183 |
-
"中",
|
1184 |
-
"来",
|
1185 |
-
"上",
|
1186 |
-
"大",
|
1187 |
-
"为",
|
1188 |
-
"和",
|
1189 |
-
"国",
|
1190 |
-
"地",
|
1191 |
-
"到",
|
1192 |
-
"以",
|
1193 |
-
"说",
|
1194 |
-
"时",
|
1195 |
-
"要",
|
1196 |
-
"就",
|
1197 |
-
"出",
|
1198 |
-
"会",
|
1199 |
-
"可",
|
1200 |
-
"也",
|
1201 |
-
"你",
|
1202 |
-
"对",
|
1203 |
-
"生",
|
1204 |
-
"能",
|
1205 |
-
"而",
|
1206 |
-
"子",
|
1207 |
-
"那",
|
1208 |
-
"得",
|
1209 |
-
"于",
|
1210 |
-
"着",
|
1211 |
-
"下",
|
1212 |
-
"自",
|
1213 |
-
"之",
|
1214 |
-
"年",
|
1215 |
-
"过",
|
1216 |
-
"发",
|
1217 |
-
"后",
|
1218 |
-
"作",
|
1219 |
-
"里",
|
1220 |
-
"用",
|
1221 |
-
"道",
|
1222 |
-
"行",
|
1223 |
-
"所",
|
1224 |
-
"然",
|
1225 |
-
"家",
|
1226 |
-
"种",
|
1227 |
-
"事",
|
1228 |
-
"成",
|
1229 |
-
"方",
|
1230 |
-
"多",
|
1231 |
-
"经",
|
1232 |
-
"么",
|
1233 |
-
"去",
|
1234 |
-
"法",
|
1235 |
-
"学",
|
1236 |
-
"如",
|
1237 |
-
"都",
|
1238 |
-
"同",
|
1239 |
-
"现",
|
1240 |
-
"当",
|
1241 |
-
"没",
|
1242 |
-
"动",
|
1243 |
-
"面",
|
1244 |
-
"起",
|
1245 |
-
"看",
|
1246 |
-
"定",
|
1247 |
-
"天",
|
1248 |
-
"分",
|
1249 |
-
"还",
|
1250 |
-
"进",
|
1251 |
-
"好",
|
1252 |
-
"小",
|
1253 |
-
"部",
|
1254 |
-
"其",
|
1255 |
-
"些",
|
1256 |
-
"主",
|
1257 |
-
"样",
|
1258 |
-
"理",
|
1259 |
-
"心",
|
1260 |
-
"她",
|
1261 |
-
"本",
|
1262 |
-
"前",
|
1263 |
-
"开",
|
1264 |
-
"但",
|
1265 |
-
"因",
|
1266 |
-
"只",
|
1267 |
-
"从",
|
1268 |
-
"想",
|
1269 |
-
"实",
|
1270 |
-
],
|
1271 |
-
"Ukrainian": [
|
1272 |
-
"о",
|
1273 |
-
"а",
|
1274 |
-
"н",
|
1275 |
-
"і",
|
1276 |
-
"и",
|
1277 |
-
"р",
|
1278 |
-
"в",
|
1279 |
-
"т",
|
1280 |
-
"е",
|
1281 |
-
"с",
|
1282 |
-
"к",
|
1283 |
-
"л",
|
1284 |
-
"у",
|
1285 |
-
"д",
|
1286 |
-
"м",
|
1287 |
-
"п",
|
1288 |
-
"з",
|
1289 |
-
"я",
|
1290 |
-
"ь",
|
1291 |
-
"б",
|
1292 |
-
"г",
|
1293 |
-
"й",
|
1294 |
-
"ч",
|
1295 |
-
"х",
|
1296 |
-
"ц",
|
1297 |
-
"ї",
|
1298 |
-
],
|
1299 |
-
"Norwegian": [
|
1300 |
-
"e",
|
1301 |
-
"r",
|
1302 |
-
"n",
|
1303 |
-
"t",
|
1304 |
-
"a",
|
1305 |
-
"s",
|
1306 |
-
"i",
|
1307 |
-
"o",
|
1308 |
-
"l",
|
1309 |
-
"d",
|
1310 |
-
"g",
|
1311 |
-
"k",
|
1312 |
-
"m",
|
1313 |
-
"v",
|
1314 |
-
"f",
|
1315 |
-
"p",
|
1316 |
-
"u",
|
1317 |
-
"b",
|
1318 |
-
"h",
|
1319 |
-
"å",
|
1320 |
-
"y",
|
1321 |
-
"j",
|
1322 |
-
"ø",
|
1323 |
-
"c",
|
1324 |
-
"æ",
|
1325 |
-
"w",
|
1326 |
-
],
|
1327 |
-
"Finnish": [
|
1328 |
-
"a",
|
1329 |
-
"i",
|
1330 |
-
"n",
|
1331 |
-
"t",
|
1332 |
-
"e",
|
1333 |
-
"s",
|
1334 |
-
"l",
|
1335 |
-
"o",
|
1336 |
-
"u",
|
1337 |
-
"k",
|
1338 |
-
"ä",
|
1339 |
-
"m",
|
1340 |
-
"r",
|
1341 |
-
"v",
|
1342 |
-
"j",
|
1343 |
-
"h",
|
1344 |
-
"p",
|
1345 |
-
"y",
|
1346 |
-
"d",
|
1347 |
-
"ö",
|
1348 |
-
"g",
|
1349 |
-
"c",
|
1350 |
-
"b",
|
1351 |
-
"f",
|
1352 |
-
"w",
|
1353 |
-
"z",
|
1354 |
-
],
|
1355 |
-
"Vietnamese": [
|
1356 |
-
"n",
|
1357 |
-
"h",
|
1358 |
-
"t",
|
1359 |
-
"i",
|
1360 |
-
"c",
|
1361 |
-
"g",
|
1362 |
-
"a",
|
1363 |
-
"o",
|
1364 |
-
"u",
|
1365 |
-
"m",
|
1366 |
-
"l",
|
1367 |
-
"r",
|
1368 |
-
"à",
|
1369 |
-
"đ",
|
1370 |
-
"s",
|
1371 |
-
"e",
|
1372 |
-
"v",
|
1373 |
-
"p",
|
1374 |
-
"b",
|
1375 |
-
"y",
|
1376 |
-
"ư",
|
1377 |
-
"d",
|
1378 |
-
"á",
|
1379 |
-
"k",
|
1380 |
-
"ộ",
|
1381 |
-
"ế",
|
1382 |
-
],
|
1383 |
-
"Czech": [
|
1384 |
-
"o",
|
1385 |
-
"e",
|
1386 |
-
"a",
|
1387 |
-
"n",
|
1388 |
-
"t",
|
1389 |
-
"s",
|
1390 |
-
"i",
|
1391 |
-
"l",
|
1392 |
-
"v",
|
1393 |
-
"r",
|
1394 |
-
"k",
|
1395 |
-
"d",
|
1396 |
-
"u",
|
1397 |
-
"m",
|
1398 |
-
"p",
|
1399 |
-
"í",
|
1400 |
-
"c",
|
1401 |
-
"h",
|
1402 |
-
"z",
|
1403 |
-
"á",
|
1404 |
-
"y",
|
1405 |
-
"j",
|
1406 |
-
"b",
|
1407 |
-
"ě",
|
1408 |
-
"é",
|
1409 |
-
"ř",
|
1410 |
-
],
|
1411 |
-
"Hungarian": [
|
1412 |
-
"e",
|
1413 |
-
"a",
|
1414 |
-
"t",
|
1415 |
-
"l",
|
1416 |
-
"s",
|
1417 |
-
"n",
|
1418 |
-
"k",
|
1419 |
-
"r",
|
1420 |
-
"i",
|
1421 |
-
"o",
|
1422 |
-
"z",
|
1423 |
-
"á",
|
1424 |
-
"é",
|
1425 |
-
"g",
|
1426 |
-
"m",
|
1427 |
-
"b",
|
1428 |
-
"y",
|
1429 |
-
"v",
|
1430 |
-
"d",
|
1431 |
-
"h",
|
1432 |
-
"u",
|
1433 |
-
"p",
|
1434 |
-
"j",
|
1435 |
-
"ö",
|
1436 |
-
"f",
|
1437 |
-
"c",
|
1438 |
-
],
|
1439 |
-
"Korean": [
|
1440 |
-
"이",
|
1441 |
-
"다",
|
1442 |
-
"에",
|
1443 |
-
"의",
|
1444 |
-
"는",
|
1445 |
-
"로",
|
1446 |
-
"하",
|
1447 |
-
"을",
|
1448 |
-
"가",
|
1449 |
-
"고",
|
1450 |
-
"지",
|
1451 |
-
"서",
|
1452 |
-
"한",
|
1453 |
-
"은",
|
1454 |
-
"기",
|
1455 |
-
"으",
|
1456 |
-
"년",
|
1457 |
-
"대",
|
1458 |
-
"사",
|
1459 |
-
"시",
|
1460 |
-
"를",
|
1461 |
-
"리",
|
1462 |
-
"도",
|
1463 |
-
"인",
|
1464 |
-
"스",
|
1465 |
-
"일",
|
1466 |
-
],
|
1467 |
-
"Indonesian": [
|
1468 |
-
"a",
|
1469 |
-
"n",
|
1470 |
-
"e",
|
1471 |
-
"i",
|
1472 |
-
"r",
|
1473 |
-
"t",
|
1474 |
-
"u",
|
1475 |
-
"s",
|
1476 |
-
"d",
|
1477 |
-
"k",
|
1478 |
-
"m",
|
1479 |
-
"l",
|
1480 |
-
"g",
|
1481 |
-
"p",
|
1482 |
-
"b",
|
1483 |
-
"o",
|
1484 |
-
"h",
|
1485 |
-
"y",
|
1486 |
-
"j",
|
1487 |
-
"c",
|
1488 |
-
"w",
|
1489 |
-
"f",
|
1490 |
-
"v",
|
1491 |
-
"z",
|
1492 |
-
"x",
|
1493 |
-
"q",
|
1494 |
-
],
|
1495 |
-
"Turkish": [
|
1496 |
-
"a",
|
1497 |
-
"e",
|
1498 |
-
"i",
|
1499 |
-
"n",
|
1500 |
-
"r",
|
1501 |
-
"l",
|
1502 |
-
"ı",
|
1503 |
-
"k",
|
1504 |
-
"d",
|
1505 |
-
"t",
|
1506 |
-
"s",
|
1507 |
-
"m",
|
1508 |
-
"y",
|
1509 |
-
"u",
|
1510 |
-
"o",
|
1511 |
-
"b",
|
1512 |
-
"ü",
|
1513 |
-
"ş",
|
1514 |
-
"v",
|
1515 |
-
"g",
|
1516 |
-
"z",
|
1517 |
-
"h",
|
1518 |
-
"c",
|
1519 |
-
"p",
|
1520 |
-
"ç",
|
1521 |
-
"ğ",
|
1522 |
-
],
|
1523 |
-
"Romanian": [
|
1524 |
-
"e",
|
1525 |
-
"i",
|
1526 |
-
"a",
|
1527 |
-
"r",
|
1528 |
-
"n",
|
1529 |
-
"t",
|
1530 |
-
"u",
|
1531 |
-
"l",
|
1532 |
-
"o",
|
1533 |
-
"c",
|
1534 |
-
"s",
|
1535 |
-
"d",
|
1536 |
-
"p",
|
1537 |
-
"m",
|
1538 |
-
"ă",
|
1539 |
-
"f",
|
1540 |
-
"v",
|
1541 |
-
"î",
|
1542 |
-
"g",
|
1543 |
-
"b",
|
1544 |
-
"ș",
|
1545 |
-
"ț",
|
1546 |
-
"z",
|
1547 |
-
"h",
|
1548 |
-
"â",
|
1549 |
-
"j",
|
1550 |
-
],
|
1551 |
-
"Farsi": [
|
1552 |
-
"ا",
|
1553 |
-
"ی",
|
1554 |
-
"ر",
|
1555 |
-
"د",
|
1556 |
-
"ن",
|
1557 |
-
"ه",
|
1558 |
-
"و",
|
1559 |
-
"م",
|
1560 |
-
"ت",
|
1561 |
-
"ب",
|
1562 |
-
"س",
|
1563 |
-
"ل",
|
1564 |
-
"ک",
|
1565 |
-
"ش",
|
1566 |
-
"ز",
|
1567 |
-
"ف",
|
1568 |
-
"گ",
|
1569 |
-
"ع",
|
1570 |
-
"خ",
|
1571 |
-
"ق",
|
1572 |
-
"ج",
|
1573 |
-
"آ",
|
1574 |
-
"پ",
|
1575 |
-
"ح",
|
1576 |
-
"ط",
|
1577 |
-
"ص",
|
1578 |
-
],
|
1579 |
-
"Arabic": [
|
1580 |
-
"ا",
|
1581 |
-
"ل",
|
1582 |
-
"ي",
|
1583 |
-
"م",
|
1584 |
-
"و",
|
1585 |
-
"ن",
|
1586 |
-
"ر",
|
1587 |
-
"ت",
|
1588 |
-
"ب",
|
1589 |
-
"ة",
|
1590 |
-
"ع",
|
1591 |
-
"د",
|
1592 |
-
"س",
|
1593 |
-
"ف",
|
1594 |
-
"ه",
|
1595 |
-
"ك",
|
1596 |
-
"ق",
|
1597 |
-
"أ",
|
1598 |
-
"ح",
|
1599 |
-
"ج",
|
1600 |
-
"ش",
|
1601 |
-
"ط",
|
1602 |
-
"ص",
|
1603 |
-
"ى",
|
1604 |
-
"خ",
|
1605 |
-
"إ",
|
1606 |
-
],
|
1607 |
-
"Danish": [
|
1608 |
-
"e",
|
1609 |
-
"r",
|
1610 |
-
"n",
|
1611 |
-
"t",
|
1612 |
-
"a",
|
1613 |
-
"i",
|
1614 |
-
"s",
|
1615 |
-
"d",
|
1616 |
-
"l",
|
1617 |
-
"o",
|
1618 |
-
"g",
|
1619 |
-
"m",
|
1620 |
-
"k",
|
1621 |
-
"f",
|
1622 |
-
"v",
|
1623 |
-
"u",
|
1624 |
-
"b",
|
1625 |
-
"h",
|
1626 |
-
"p",
|
1627 |
-
"å",
|
1628 |
-
"y",
|
1629 |
-
"ø",
|
1630 |
-
"æ",
|
1631 |
-
"c",
|
1632 |
-
"j",
|
1633 |
-
"w",
|
1634 |
-
],
|
1635 |
-
"Serbian": [
|
1636 |
-
"а",
|
1637 |
-
"и",
|
1638 |
-
"о",
|
1639 |
-
"е",
|
1640 |
-
"н",
|
1641 |
-
"р",
|
1642 |
-
"с",
|
1643 |
-
"у",
|
1644 |
-
"т",
|
1645 |
-
"к",
|
1646 |
-
"ј",
|
1647 |
-
"в",
|
1648 |
-
"д",
|
1649 |
-
"м",
|
1650 |
-
"п",
|
1651 |
-
"л",
|
1652 |
-
"г",
|
1653 |
-
"з",
|
1654 |
-
"б",
|
1655 |
-
"a",
|
1656 |
-
"i",
|
1657 |
-
"e",
|
1658 |
-
"o",
|
1659 |
-
"n",
|
1660 |
-
"ц",
|
1661 |
-
"ш",
|
1662 |
-
],
|
1663 |
-
"Lithuanian": [
|
1664 |
-
"i",
|
1665 |
-
"a",
|
1666 |
-
"s",
|
1667 |
-
"o",
|
1668 |
-
"r",
|
1669 |
-
"e",
|
1670 |
-
"t",
|
1671 |
-
"n",
|
1672 |
-
"u",
|
1673 |
-
"k",
|
1674 |
-
"m",
|
1675 |
-
"l",
|
1676 |
-
"p",
|
1677 |
-
"v",
|
1678 |
-
"d",
|
1679 |
-
"j",
|
1680 |
-
"g",
|
1681 |
-
"ė",
|
1682 |
-
"b",
|
1683 |
-
"y",
|
1684 |
-
"ų",
|
1685 |
-
"š",
|
1686 |
-
"ž",
|
1687 |
-
"c",
|
1688 |
-
"ą",
|
1689 |
-
"į",
|
1690 |
-
],
|
1691 |
-
"Slovene": [
|
1692 |
-
"e",
|
1693 |
-
"a",
|
1694 |
-
"i",
|
1695 |
-
"o",
|
1696 |
-
"n",
|
1697 |
-
"r",
|
1698 |
-
"s",
|
1699 |
-
"l",
|
1700 |
-
"t",
|
1701 |
-
"j",
|
1702 |
-
"v",
|
1703 |
-
"k",
|
1704 |
-
"d",
|
1705 |
-
"p",
|
1706 |
-
"m",
|
1707 |
-
"u",
|
1708 |
-
"z",
|
1709 |
-
"b",
|
1710 |
-
"g",
|
1711 |
-
"h",
|
1712 |
-
"č",
|
1713 |
-
"c",
|
1714 |
-
"š",
|
1715 |
-
"ž",
|
1716 |
-
"f",
|
1717 |
-
"y",
|
1718 |
-
],
|
1719 |
-
"Slovak": [
|
1720 |
-
"o",
|
1721 |
-
"a",
|
1722 |
-
"e",
|
1723 |
-
"n",
|
1724 |
-
"i",
|
1725 |
-
"r",
|
1726 |
-
"v",
|
1727 |
-
"t",
|
1728 |
-
"s",
|
1729 |
-
"l",
|
1730 |
-
"k",
|
1731 |
-
"d",
|
1732 |
-
"m",
|
1733 |
-
"p",
|
1734 |
-
"u",
|
1735 |
-
"c",
|
1736 |
-
"h",
|
1737 |
-
"j",
|
1738 |
-
"b",
|
1739 |
-
"z",
|
1740 |
-
"á",
|
1741 |
-
"y",
|
1742 |
-
"ý",
|
1743 |
-
"í",
|
1744 |
-
"č",
|
1745 |
-
"é",
|
1746 |
-
],
|
1747 |
-
"Hebrew": [
|
1748 |
-
"י",
|
1749 |
-
"ו",
|
1750 |
-
"ה",
|
1751 |
-
"ל",
|
1752 |
-
"ר",
|
1753 |
-
"ב",
|
1754 |
-
"ת",
|
1755 |
-
"מ",
|
1756 |
-
"א",
|
1757 |
-
"ש",
|
1758 |
-
"נ",
|
1759 |
-
"ע",
|
1760 |
-
"ם",
|
1761 |
-
"ד",
|
1762 |
-
"ק",
|
1763 |
-
"ח",
|
1764 |
-
"פ",
|
1765 |
-
"ס",
|
1766 |
-
"כ",
|
1767 |
-
"ג",
|
1768 |
-
"ט",
|
1769 |
-
"צ",
|
1770 |
-
"ן",
|
1771 |
-
"ז",
|
1772 |
-
"ך",
|
1773 |
-
],
|
1774 |
-
"Bulgarian": [
|
1775 |
-
"а",
|
1776 |
-
"и",
|
1777 |
-
"о",
|
1778 |
-
"е",
|
1779 |
-
"н",
|
1780 |
-
"т",
|
1781 |
-
"р",
|
1782 |
-
"с",
|
1783 |
-
"в",
|
1784 |
-
"л",
|
1785 |
-
"к",
|
1786 |
-
"д",
|
1787 |
-
"п",
|
1788 |
-
"м",
|
1789 |
-
"з",
|
1790 |
-
"г",
|
1791 |
-
"я",
|
1792 |
-
"ъ",
|
1793 |
-
"у",
|
1794 |
-
"б",
|
1795 |
-
"ч",
|
1796 |
-
"ц",
|
1797 |
-
"й",
|
1798 |
-
"ж",
|
1799 |
-
"щ",
|
1800 |
-
"х",
|
1801 |
-
],
|
1802 |
-
"Croatian": [
|
1803 |
-
"a",
|
1804 |
-
"i",
|
1805 |
-
"o",
|
1806 |
-
"e",
|
1807 |
-
"n",
|
1808 |
-
"r",
|
1809 |
-
"j",
|
1810 |
-
"s",
|
1811 |
-
"t",
|
1812 |
-
"u",
|
1813 |
-
"k",
|
1814 |
-
"l",
|
1815 |
-
"v",
|
1816 |
-
"d",
|
1817 |
-
"m",
|
1818 |
-
"p",
|
1819 |
-
"g",
|
1820 |
-
"z",
|
1821 |
-
"b",
|
1822 |
-
"c",
|
1823 |
-
"č",
|
1824 |
-
"h",
|
1825 |
-
"š",
|
1826 |
-
"ž",
|
1827 |
-
"ć",
|
1828 |
-
"f",
|
1829 |
-
],
|
1830 |
-
"Hindi": [
|
1831 |
-
"क",
|
1832 |
-
"र",
|
1833 |
-
"स",
|
1834 |
-
"न",
|
1835 |
-
"त",
|
1836 |
-
"म",
|
1837 |
-
"ह",
|
1838 |
-
"प",
|
1839 |
-
"य",
|
1840 |
-
"ल",
|
1841 |
-
"व",
|
1842 |
-
"ज",
|
1843 |
-
"द",
|
1844 |
-
"ग",
|
1845 |
-
"ब",
|
1846 |
-
"श",
|
1847 |
-
"ट",
|
1848 |
-
"अ",
|
1849 |
-
"ए",
|
1850 |
-
"थ",
|
1851 |
-
"भ",
|
1852 |
-
"ड",
|
1853 |
-
"च",
|
1854 |
-
"ध",
|
1855 |
-
"ष",
|
1856 |
-
"इ",
|
1857 |
-
],
|
1858 |
-
"Estonian": [
|
1859 |
-
"a",
|
1860 |
-
"i",
|
1861 |
-
"e",
|
1862 |
-
"s",
|
1863 |
-
"t",
|
1864 |
-
"l",
|
1865 |
-
"u",
|
1866 |
-
"n",
|
1867 |
-
"o",
|
1868 |
-
"k",
|
1869 |
-
"r",
|
1870 |
-
"d",
|
1871 |
-
"m",
|
1872 |
-
"v",
|
1873 |
-
"g",
|
1874 |
-
"p",
|
1875 |
-
"j",
|
1876 |
-
"h",
|
1877 |
-
"ä",
|
1878 |
-
"b",
|
1879 |
-
"õ",
|
1880 |
-
"ü",
|
1881 |
-
"f",
|
1882 |
-
"c",
|
1883 |
-
"ö",
|
1884 |
-
"y",
|
1885 |
-
],
|
1886 |
-
"Thai": [
|
1887 |
-
"า",
|
1888 |
-
"น",
|
1889 |
-
"ร",
|
1890 |
-
"อ",
|
1891 |
-
"ก",
|
1892 |
-
"เ",
|
1893 |
-
"ง",
|
1894 |
-
"ม",
|
1895 |
-
"ย",
|
1896 |
-
"ล",
|
1897 |
-
"ว",
|
1898 |
-
"ด",
|
1899 |
-
"ท",
|
1900 |
-
"ส",
|
1901 |
-
"ต",
|
1902 |
-
"ะ",
|
1903 |
-
"ป",
|
1904 |
-
"บ",
|
1905 |
-
"ค",
|
1906 |
-
"ห",
|
1907 |
-
"แ",
|
1908 |
-
"จ",
|
1909 |
-
"พ",
|
1910 |
-
"ช",
|
1911 |
-
"ข",
|
1912 |
-
"ใ",
|
1913 |
-
],
|
1914 |
-
"Greek": [
|
1915 |
-
"α",
|
1916 |
-
"τ",
|
1917 |
-
"ο",
|
1918 |
-
"ι",
|
1919 |
-
"ε",
|
1920 |
-
"ν",
|
1921 |
-
"ρ",
|
1922 |
-
"σ",
|
1923 |
-
"κ",
|
1924 |
-
"η",
|
1925 |
-
"π",
|
1926 |
-
"ς",
|
1927 |
-
"υ",
|
1928 |
-
"μ",
|
1929 |
-
"λ",
|
1930 |
-
"ί",
|
1931 |
-
"ό",
|
1932 |
-
"ά",
|
1933 |
-
"γ",
|
1934 |
-
"έ",
|
1935 |
-
"δ",
|
1936 |
-
"ή",
|
1937 |
-
"ω",
|
1938 |
-
"χ",
|
1939 |
-
"θ",
|
1940 |
-
"ύ",
|
1941 |
-
],
|
1942 |
-
"Tamil": [
|
1943 |
-
"க",
|
1944 |
-
"த",
|
1945 |
-
"ப",
|
1946 |
-
"ட",
|
1947 |
-
"ர",
|
1948 |
-
"ம",
|
1949 |
-
"ல",
|
1950 |
-
"ன",
|
1951 |
-
"வ",
|
1952 |
-
"ற",
|
1953 |
-
"ய",
|
1954 |
-
"ள",
|
1955 |
-
"ச",
|
1956 |
-
"ந",
|
1957 |
-
"இ",
|
1958 |
-
"ண",
|
1959 |
-
"அ",
|
1960 |
-
"ஆ",
|
1961 |
-
"ழ",
|
1962 |
-
"ங",
|
1963 |
-
"எ",
|
1964 |
-
"உ",
|
1965 |
-
"ஒ",
|
1966 |
-
"ஸ",
|
1967 |
-
],
|
1968 |
-
"Kazakh": [
|
1969 |
-
"а",
|
1970 |
-
"ы",
|
1971 |
-
"е",
|
1972 |
-
"н",
|
1973 |
-
"т",
|
1974 |
-
"р",
|
1975 |
-
"л",
|
1976 |
-
"і",
|
1977 |
-
"д",
|
1978 |
-
"с",
|
1979 |
-
"м",
|
1980 |
-
"қ",
|
1981 |
-
"к",
|
1982 |
-
"о",
|
1983 |
-
"б",
|
1984 |
-
"и",
|
1985 |
-
"у",
|
1986 |
-
"ғ",
|
1987 |
-
"ж",
|
1988 |
-
"ң",
|
1989 |
-
"з",
|
1990 |
-
"ш",
|
1991 |
-
"й",
|
1992 |
-
"п",
|
1993 |
-
"г",
|
1994 |
-
"ө",
|
1995 |
-
],
|
1996 |
-
}
|
1997 |
-
|
1998 |
-
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/legacy.py
DELETED
@@ -1,66 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
from typing import TYPE_CHECKING, Any
|
4 |
-
from warnings import warn
|
5 |
-
|
6 |
-
from .api import from_bytes
|
7 |
-
from .constant import CHARDET_CORRESPONDENCE
|
8 |
-
|
9 |
-
# TODO: remove this check when dropping Python 3.7 support
|
10 |
-
if TYPE_CHECKING:
|
11 |
-
from typing_extensions import TypedDict
|
12 |
-
|
13 |
-
class ResultDict(TypedDict):
|
14 |
-
encoding: str | None
|
15 |
-
language: str
|
16 |
-
confidence: float | None
|
17 |
-
|
18 |
-
|
19 |
-
def detect(
|
20 |
-
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
|
21 |
-
) -> ResultDict:
|
22 |
-
"""
|
23 |
-
chardet legacy method
|
24 |
-
Detect the encoding of the given byte string. It should be mostly backward-compatible.
|
25 |
-
Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
|
26 |
-
This function is deprecated and should be used to migrate your project easily, consult the documentation for
|
27 |
-
further information. Not planned for removal.
|
28 |
-
|
29 |
-
:param byte_str: The byte sequence to examine.
|
30 |
-
:param should_rename_legacy: Should we rename legacy encodings
|
31 |
-
to their more modern equivalents?
|
32 |
-
"""
|
33 |
-
if len(kwargs):
|
34 |
-
warn(
|
35 |
-
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
|
36 |
-
)
|
37 |
-
|
38 |
-
if not isinstance(byte_str, (bytearray, bytes)):
|
39 |
-
raise TypeError( # pragma: nocover
|
40 |
-
"Expected object of type bytes or bytearray, got: " "{}".format(
|
41 |
-
type(byte_str)
|
42 |
-
)
|
43 |
-
)
|
44 |
-
|
45 |
-
if isinstance(byte_str, bytearray):
|
46 |
-
byte_str = bytes(byte_str)
|
47 |
-
|
48 |
-
r = from_bytes(byte_str).best()
|
49 |
-
|
50 |
-
encoding = r.encoding if r is not None else None
|
51 |
-
language = r.language if r is not None and r.language != "Unknown" else ""
|
52 |
-
confidence = 1.0 - r.chaos if r is not None else None
|
53 |
-
|
54 |
-
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
|
55 |
-
# but chardet does return 'utf-8-sig' and it is a valid codec name.
|
56 |
-
if r is not None and encoding == "utf_8" and r.bom:
|
57 |
-
encoding += "_sig"
|
58 |
-
|
59 |
-
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
|
60 |
-
encoding = CHARDET_CORRESPONDENCE[encoding]
|
61 |
-
|
62 |
-
return {
|
63 |
-
"encoding": encoding,
|
64 |
-
"language": language,
|
65 |
-
"confidence": confidence,
|
66 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/md.py
DELETED
@@ -1,630 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
from functools import lru_cache
|
4 |
-
from logging import getLogger
|
5 |
-
|
6 |
-
from .constant import (
|
7 |
-
COMMON_SAFE_ASCII_CHARACTERS,
|
8 |
-
TRACE,
|
9 |
-
UNICODE_SECONDARY_RANGE_KEYWORD,
|
10 |
-
)
|
11 |
-
from .utils import (
|
12 |
-
is_accentuated,
|
13 |
-
is_arabic,
|
14 |
-
is_arabic_isolated_form,
|
15 |
-
is_case_variable,
|
16 |
-
is_cjk,
|
17 |
-
is_emoticon,
|
18 |
-
is_hangul,
|
19 |
-
is_hiragana,
|
20 |
-
is_katakana,
|
21 |
-
is_latin,
|
22 |
-
is_punctuation,
|
23 |
-
is_separator,
|
24 |
-
is_symbol,
|
25 |
-
is_thai,
|
26 |
-
is_unprintable,
|
27 |
-
remove_accent,
|
28 |
-
unicode_range,
|
29 |
-
)
|
30 |
-
|
31 |
-
|
32 |
-
class MessDetectorPlugin:
|
33 |
-
"""
|
34 |
-
Base abstract class used for mess detection plugins.
|
35 |
-
All detectors MUST extend and implement given methods.
|
36 |
-
"""
|
37 |
-
|
38 |
-
def eligible(self, character: str) -> bool:
|
39 |
-
"""
|
40 |
-
Determine if given character should be fed in.
|
41 |
-
"""
|
42 |
-
raise NotImplementedError # pragma: nocover
|
43 |
-
|
44 |
-
def feed(self, character: str) -> None:
|
45 |
-
"""
|
46 |
-
The main routine to be executed upon character.
|
47 |
-
Insert the logic in witch the text would be considered chaotic.
|
48 |
-
"""
|
49 |
-
raise NotImplementedError # pragma: nocover
|
50 |
-
|
51 |
-
def reset(self) -> None: # pragma: no cover
|
52 |
-
"""
|
53 |
-
Permit to reset the plugin to the initial state.
|
54 |
-
"""
|
55 |
-
raise NotImplementedError
|
56 |
-
|
57 |
-
@property
|
58 |
-
def ratio(self) -> float:
|
59 |
-
"""
|
60 |
-
Compute the chaos ratio based on what your feed() has seen.
|
61 |
-
Must NOT be lower than 0.; No restriction gt 0.
|
62 |
-
"""
|
63 |
-
raise NotImplementedError # pragma: nocover
|
64 |
-
|
65 |
-
|
66 |
-
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
67 |
-
def __init__(self) -> None:
|
68 |
-
self._punctuation_count: int = 0
|
69 |
-
self._symbol_count: int = 0
|
70 |
-
self._character_count: int = 0
|
71 |
-
|
72 |
-
self._last_printable_char: str | None = None
|
73 |
-
self._frenzy_symbol_in_word: bool = False
|
74 |
-
|
75 |
-
def eligible(self, character: str) -> bool:
|
76 |
-
return character.isprintable()
|
77 |
-
|
78 |
-
def feed(self, character: str) -> None:
|
79 |
-
self._character_count += 1
|
80 |
-
|
81 |
-
if (
|
82 |
-
character != self._last_printable_char
|
83 |
-
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
84 |
-
):
|
85 |
-
if is_punctuation(character):
|
86 |
-
self._punctuation_count += 1
|
87 |
-
elif (
|
88 |
-
character.isdigit() is False
|
89 |
-
and is_symbol(character)
|
90 |
-
and is_emoticon(character) is False
|
91 |
-
):
|
92 |
-
self._symbol_count += 2
|
93 |
-
|
94 |
-
self._last_printable_char = character
|
95 |
-
|
96 |
-
def reset(self) -> None: # Abstract
|
97 |
-
self._punctuation_count = 0
|
98 |
-
self._character_count = 0
|
99 |
-
self._symbol_count = 0
|
100 |
-
|
101 |
-
@property
|
102 |
-
def ratio(self) -> float:
|
103 |
-
if self._character_count == 0:
|
104 |
-
return 0.0
|
105 |
-
|
106 |
-
ratio_of_punctuation: float = (
|
107 |
-
self._punctuation_count + self._symbol_count
|
108 |
-
) / self._character_count
|
109 |
-
|
110 |
-
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
111 |
-
|
112 |
-
|
113 |
-
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
114 |
-
def __init__(self) -> None:
|
115 |
-
self._character_count: int = 0
|
116 |
-
self._accentuated_count: int = 0
|
117 |
-
|
118 |
-
def eligible(self, character: str) -> bool:
|
119 |
-
return character.isalpha()
|
120 |
-
|
121 |
-
def feed(self, character: str) -> None:
|
122 |
-
self._character_count += 1
|
123 |
-
|
124 |
-
if is_accentuated(character):
|
125 |
-
self._accentuated_count += 1
|
126 |
-
|
127 |
-
def reset(self) -> None: # Abstract
|
128 |
-
self._character_count = 0
|
129 |
-
self._accentuated_count = 0
|
130 |
-
|
131 |
-
@property
|
132 |
-
def ratio(self) -> float:
|
133 |
-
if self._character_count < 8:
|
134 |
-
return 0.0
|
135 |
-
|
136 |
-
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
137 |
-
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
138 |
-
|
139 |
-
|
140 |
-
class UnprintablePlugin(MessDetectorPlugin):
|
141 |
-
def __init__(self) -> None:
|
142 |
-
self._unprintable_count: int = 0
|
143 |
-
self._character_count: int = 0
|
144 |
-
|
145 |
-
def eligible(self, character: str) -> bool:
|
146 |
-
return True
|
147 |
-
|
148 |
-
def feed(self, character: str) -> None:
|
149 |
-
if is_unprintable(character):
|
150 |
-
self._unprintable_count += 1
|
151 |
-
self._character_count += 1
|
152 |
-
|
153 |
-
def reset(self) -> None: # Abstract
|
154 |
-
self._unprintable_count = 0
|
155 |
-
|
156 |
-
@property
|
157 |
-
def ratio(self) -> float:
|
158 |
-
if self._character_count == 0:
|
159 |
-
return 0.0
|
160 |
-
|
161 |
-
return (self._unprintable_count * 8) / self._character_count
|
162 |
-
|
163 |
-
|
164 |
-
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
165 |
-
def __init__(self) -> None:
|
166 |
-
self._successive_count: int = 0
|
167 |
-
self._character_count: int = 0
|
168 |
-
|
169 |
-
self._last_latin_character: str | None = None
|
170 |
-
|
171 |
-
def eligible(self, character: str) -> bool:
|
172 |
-
return character.isalpha() and is_latin(character)
|
173 |
-
|
174 |
-
def feed(self, character: str) -> None:
|
175 |
-
self._character_count += 1
|
176 |
-
if (
|
177 |
-
self._last_latin_character is not None
|
178 |
-
and is_accentuated(character)
|
179 |
-
and is_accentuated(self._last_latin_character)
|
180 |
-
):
|
181 |
-
if character.isupper() and self._last_latin_character.isupper():
|
182 |
-
self._successive_count += 1
|
183 |
-
# Worse if its the same char duplicated with different accent.
|
184 |
-
if remove_accent(character) == remove_accent(self._last_latin_character):
|
185 |
-
self._successive_count += 1
|
186 |
-
self._last_latin_character = character
|
187 |
-
|
188 |
-
def reset(self) -> None: # Abstract
|
189 |
-
self._successive_count = 0
|
190 |
-
self._character_count = 0
|
191 |
-
self._last_latin_character = None
|
192 |
-
|
193 |
-
@property
|
194 |
-
def ratio(self) -> float:
|
195 |
-
if self._character_count == 0:
|
196 |
-
return 0.0
|
197 |
-
|
198 |
-
return (self._successive_count * 2) / self._character_count
|
199 |
-
|
200 |
-
|
201 |
-
class SuspiciousRange(MessDetectorPlugin):
|
202 |
-
def __init__(self) -> None:
|
203 |
-
self._suspicious_successive_range_count: int = 0
|
204 |
-
self._character_count: int = 0
|
205 |
-
self._last_printable_seen: str | None = None
|
206 |
-
|
207 |
-
def eligible(self, character: str) -> bool:
|
208 |
-
return character.isprintable()
|
209 |
-
|
210 |
-
def feed(self, character: str) -> None:
|
211 |
-
self._character_count += 1
|
212 |
-
|
213 |
-
if (
|
214 |
-
character.isspace()
|
215 |
-
or is_punctuation(character)
|
216 |
-
or character in COMMON_SAFE_ASCII_CHARACTERS
|
217 |
-
):
|
218 |
-
self._last_printable_seen = None
|
219 |
-
return
|
220 |
-
|
221 |
-
if self._last_printable_seen is None:
|
222 |
-
self._last_printable_seen = character
|
223 |
-
return
|
224 |
-
|
225 |
-
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
|
226 |
-
unicode_range_b: str | None = unicode_range(character)
|
227 |
-
|
228 |
-
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
229 |
-
self._suspicious_successive_range_count += 1
|
230 |
-
|
231 |
-
self._last_printable_seen = character
|
232 |
-
|
233 |
-
def reset(self) -> None: # Abstract
|
234 |
-
self._character_count = 0
|
235 |
-
self._suspicious_successive_range_count = 0
|
236 |
-
self._last_printable_seen = None
|
237 |
-
|
238 |
-
@property
|
239 |
-
def ratio(self) -> float:
|
240 |
-
if self._character_count <= 13:
|
241 |
-
return 0.0
|
242 |
-
|
243 |
-
ratio_of_suspicious_range_usage: float = (
|
244 |
-
self._suspicious_successive_range_count * 2
|
245 |
-
) / self._character_count
|
246 |
-
|
247 |
-
return ratio_of_suspicious_range_usage
|
248 |
-
|
249 |
-
|
250 |
-
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
251 |
-
def __init__(self) -> None:
|
252 |
-
self._word_count: int = 0
|
253 |
-
self._bad_word_count: int = 0
|
254 |
-
self._foreign_long_count: int = 0
|
255 |
-
|
256 |
-
self._is_current_word_bad: bool = False
|
257 |
-
self._foreign_long_watch: bool = False
|
258 |
-
|
259 |
-
self._character_count: int = 0
|
260 |
-
self._bad_character_count: int = 0
|
261 |
-
|
262 |
-
self._buffer: str = ""
|
263 |
-
self._buffer_accent_count: int = 0
|
264 |
-
self._buffer_glyph_count: int = 0
|
265 |
-
|
266 |
-
def eligible(self, character: str) -> bool:
|
267 |
-
return True
|
268 |
-
|
269 |
-
def feed(self, character: str) -> None:
|
270 |
-
if character.isalpha():
|
271 |
-
self._buffer += character
|
272 |
-
if is_accentuated(character):
|
273 |
-
self._buffer_accent_count += 1
|
274 |
-
if (
|
275 |
-
self._foreign_long_watch is False
|
276 |
-
and (is_latin(character) is False or is_accentuated(character))
|
277 |
-
and is_cjk(character) is False
|
278 |
-
and is_hangul(character) is False
|
279 |
-
and is_katakana(character) is False
|
280 |
-
and is_hiragana(character) is False
|
281 |
-
and is_thai(character) is False
|
282 |
-
):
|
283 |
-
self._foreign_long_watch = True
|
284 |
-
if (
|
285 |
-
is_cjk(character)
|
286 |
-
or is_hangul(character)
|
287 |
-
or is_katakana(character)
|
288 |
-
or is_hiragana(character)
|
289 |
-
or is_thai(character)
|
290 |
-
):
|
291 |
-
self._buffer_glyph_count += 1
|
292 |
-
return
|
293 |
-
if not self._buffer:
|
294 |
-
return
|
295 |
-
if (
|
296 |
-
character.isspace() or is_punctuation(character) or is_separator(character)
|
297 |
-
) and self._buffer:
|
298 |
-
self._word_count += 1
|
299 |
-
buffer_length: int = len(self._buffer)
|
300 |
-
|
301 |
-
self._character_count += buffer_length
|
302 |
-
|
303 |
-
if buffer_length >= 4:
|
304 |
-
if self._buffer_accent_count / buffer_length >= 0.5:
|
305 |
-
self._is_current_word_bad = True
|
306 |
-
# Word/Buffer ending with an upper case accentuated letter are so rare,
|
307 |
-
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
|
308 |
-
elif (
|
309 |
-
is_accentuated(self._buffer[-1])
|
310 |
-
and self._buffer[-1].isupper()
|
311 |
-
and all(_.isupper() for _ in self._buffer) is False
|
312 |
-
):
|
313 |
-
self._foreign_long_count += 1
|
314 |
-
self._is_current_word_bad = True
|
315 |
-
elif self._buffer_glyph_count == 1:
|
316 |
-
self._is_current_word_bad = True
|
317 |
-
self._foreign_long_count += 1
|
318 |
-
if buffer_length >= 24 and self._foreign_long_watch:
|
319 |
-
camel_case_dst = [
|
320 |
-
i
|
321 |
-
for c, i in zip(self._buffer, range(0, buffer_length))
|
322 |
-
if c.isupper()
|
323 |
-
]
|
324 |
-
probable_camel_cased: bool = False
|
325 |
-
|
326 |
-
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
|
327 |
-
probable_camel_cased = True
|
328 |
-
|
329 |
-
if not probable_camel_cased:
|
330 |
-
self._foreign_long_count += 1
|
331 |
-
self._is_current_word_bad = True
|
332 |
-
|
333 |
-
if self._is_current_word_bad:
|
334 |
-
self._bad_word_count += 1
|
335 |
-
self._bad_character_count += len(self._buffer)
|
336 |
-
self._is_current_word_bad = False
|
337 |
-
|
338 |
-
self._foreign_long_watch = False
|
339 |
-
self._buffer = ""
|
340 |
-
self._buffer_accent_count = 0
|
341 |
-
self._buffer_glyph_count = 0
|
342 |
-
elif (
|
343 |
-
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
344 |
-
and character.isdigit() is False
|
345 |
-
and is_symbol(character)
|
346 |
-
):
|
347 |
-
self._is_current_word_bad = True
|
348 |
-
self._buffer += character
|
349 |
-
|
350 |
-
def reset(self) -> None: # Abstract
|
351 |
-
self._buffer = ""
|
352 |
-
self._is_current_word_bad = False
|
353 |
-
self._foreign_long_watch = False
|
354 |
-
self._bad_word_count = 0
|
355 |
-
self._word_count = 0
|
356 |
-
self._character_count = 0
|
357 |
-
self._bad_character_count = 0
|
358 |
-
self._foreign_long_count = 0
|
359 |
-
|
360 |
-
@property
|
361 |
-
def ratio(self) -> float:
|
362 |
-
if self._word_count <= 10 and self._foreign_long_count == 0:
|
363 |
-
return 0.0
|
364 |
-
|
365 |
-
return self._bad_character_count / self._character_count
|
366 |
-
|
367 |
-
|
368 |
-
class CjkInvalidStopPlugin(MessDetectorPlugin):
|
369 |
-
"""
|
370 |
-
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
|
371 |
-
can be easily detected. Searching for the overuse of '丅' and '丄'.
|
372 |
-
"""
|
373 |
-
|
374 |
-
def __init__(self) -> None:
|
375 |
-
self._wrong_stop_count: int = 0
|
376 |
-
self._cjk_character_count: int = 0
|
377 |
-
|
378 |
-
def eligible(self, character: str) -> bool:
|
379 |
-
return True
|
380 |
-
|
381 |
-
def feed(self, character: str) -> None:
|
382 |
-
if character in {"丅", "丄"}:
|
383 |
-
self._wrong_stop_count += 1
|
384 |
-
return
|
385 |
-
if is_cjk(character):
|
386 |
-
self._cjk_character_count += 1
|
387 |
-
|
388 |
-
def reset(self) -> None: # Abstract
|
389 |
-
self._wrong_stop_count = 0
|
390 |
-
self._cjk_character_count = 0
|
391 |
-
|
392 |
-
@property
|
393 |
-
def ratio(self) -> float:
|
394 |
-
if self._cjk_character_count < 16:
|
395 |
-
return 0.0
|
396 |
-
return self._wrong_stop_count / self._cjk_character_count
|
397 |
-
|
398 |
-
|
399 |
-
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
400 |
-
def __init__(self) -> None:
|
401 |
-
self._buf: bool = False
|
402 |
-
|
403 |
-
self._character_count_since_last_sep: int = 0
|
404 |
-
|
405 |
-
self._successive_upper_lower_count: int = 0
|
406 |
-
self._successive_upper_lower_count_final: int = 0
|
407 |
-
|
408 |
-
self._character_count: int = 0
|
409 |
-
|
410 |
-
self._last_alpha_seen: str | None = None
|
411 |
-
self._current_ascii_only: bool = True
|
412 |
-
|
413 |
-
def eligible(self, character: str) -> bool:
|
414 |
-
return True
|
415 |
-
|
416 |
-
def feed(self, character: str) -> None:
|
417 |
-
is_concerned = character.isalpha() and is_case_variable(character)
|
418 |
-
chunk_sep = is_concerned is False
|
419 |
-
|
420 |
-
if chunk_sep and self._character_count_since_last_sep > 0:
|
421 |
-
if (
|
422 |
-
self._character_count_since_last_sep <= 64
|
423 |
-
and character.isdigit() is False
|
424 |
-
and self._current_ascii_only is False
|
425 |
-
):
|
426 |
-
self._successive_upper_lower_count_final += (
|
427 |
-
self._successive_upper_lower_count
|
428 |
-
)
|
429 |
-
|
430 |
-
self._successive_upper_lower_count = 0
|
431 |
-
self._character_count_since_last_sep = 0
|
432 |
-
self._last_alpha_seen = None
|
433 |
-
self._buf = False
|
434 |
-
self._character_count += 1
|
435 |
-
self._current_ascii_only = True
|
436 |
-
|
437 |
-
return
|
438 |
-
|
439 |
-
if self._current_ascii_only is True and character.isascii() is False:
|
440 |
-
self._current_ascii_only = False
|
441 |
-
|
442 |
-
if self._last_alpha_seen is not None:
|
443 |
-
if (character.isupper() and self._last_alpha_seen.islower()) or (
|
444 |
-
character.islower() and self._last_alpha_seen.isupper()
|
445 |
-
):
|
446 |
-
if self._buf is True:
|
447 |
-
self._successive_upper_lower_count += 2
|
448 |
-
self._buf = False
|
449 |
-
else:
|
450 |
-
self._buf = True
|
451 |
-
else:
|
452 |
-
self._buf = False
|
453 |
-
|
454 |
-
self._character_count += 1
|
455 |
-
self._character_count_since_last_sep += 1
|
456 |
-
self._last_alpha_seen = character
|
457 |
-
|
458 |
-
def reset(self) -> None: # Abstract
|
459 |
-
self._character_count = 0
|
460 |
-
self._character_count_since_last_sep = 0
|
461 |
-
self._successive_upper_lower_count = 0
|
462 |
-
self._successive_upper_lower_count_final = 0
|
463 |
-
self._last_alpha_seen = None
|
464 |
-
self._buf = False
|
465 |
-
self._current_ascii_only = True
|
466 |
-
|
467 |
-
@property
|
468 |
-
def ratio(self) -> float:
|
469 |
-
if self._character_count == 0:
|
470 |
-
return 0.0
|
471 |
-
|
472 |
-
return self._successive_upper_lower_count_final / self._character_count
|
473 |
-
|
474 |
-
|
475 |
-
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
|
476 |
-
def __init__(self) -> None:
|
477 |
-
self._character_count: int = 0
|
478 |
-
self._isolated_form_count: int = 0
|
479 |
-
|
480 |
-
def reset(self) -> None: # Abstract
|
481 |
-
self._character_count = 0
|
482 |
-
self._isolated_form_count = 0
|
483 |
-
|
484 |
-
def eligible(self, character: str) -> bool:
|
485 |
-
return is_arabic(character)
|
486 |
-
|
487 |
-
def feed(self, character: str) -> None:
|
488 |
-
self._character_count += 1
|
489 |
-
|
490 |
-
if is_arabic_isolated_form(character):
|
491 |
-
self._isolated_form_count += 1
|
492 |
-
|
493 |
-
@property
|
494 |
-
def ratio(self) -> float:
|
495 |
-
if self._character_count < 8:
|
496 |
-
return 0.0
|
497 |
-
|
498 |
-
isolated_form_usage: float = self._isolated_form_count / self._character_count
|
499 |
-
|
500 |
-
return isolated_form_usage
|
501 |
-
|
502 |
-
|
503 |
-
@lru_cache(maxsize=1024)
|
504 |
-
def is_suspiciously_successive_range(
|
505 |
-
unicode_range_a: str | None, unicode_range_b: str | None
|
506 |
-
) -> bool:
|
507 |
-
"""
|
508 |
-
Determine if two Unicode range seen next to each other can be considered as suspicious.
|
509 |
-
"""
|
510 |
-
if unicode_range_a is None or unicode_range_b is None:
|
511 |
-
return True
|
512 |
-
|
513 |
-
if unicode_range_a == unicode_range_b:
|
514 |
-
return False
|
515 |
-
|
516 |
-
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
517 |
-
return False
|
518 |
-
|
519 |
-
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
520 |
-
return False
|
521 |
-
|
522 |
-
# Latin characters can be accompanied with a combining diacritical mark
|
523 |
-
# eg. Vietnamese.
|
524 |
-
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
525 |
-
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
526 |
-
):
|
527 |
-
return False
|
528 |
-
|
529 |
-
keywords_range_a, keywords_range_b = (
|
530 |
-
unicode_range_a.split(" "),
|
531 |
-
unicode_range_b.split(" "),
|
532 |
-
)
|
533 |
-
|
534 |
-
for el in keywords_range_a:
|
535 |
-
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
536 |
-
continue
|
537 |
-
if el in keywords_range_b:
|
538 |
-
return False
|
539 |
-
|
540 |
-
# Japanese Exception
|
541 |
-
range_a_jp_chars, range_b_jp_chars = (
|
542 |
-
unicode_range_a
|
543 |
-
in (
|
544 |
-
"Hiragana",
|
545 |
-
"Katakana",
|
546 |
-
),
|
547 |
-
unicode_range_b in ("Hiragana", "Katakana"),
|
548 |
-
)
|
549 |
-
if (range_a_jp_chars or range_b_jp_chars) and (
|
550 |
-
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
551 |
-
):
|
552 |
-
return False
|
553 |
-
if range_a_jp_chars and range_b_jp_chars:
|
554 |
-
return False
|
555 |
-
|
556 |
-
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
557 |
-
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
558 |
-
return False
|
559 |
-
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
560 |
-
return False
|
561 |
-
|
562 |
-
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
563 |
-
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
564 |
-
unicode_range_a in ["Katakana", "Hiragana"]
|
565 |
-
and unicode_range_b in ["Katakana", "Hiragana"]
|
566 |
-
):
|
567 |
-
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
568 |
-
return False
|
569 |
-
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
570 |
-
return False
|
571 |
-
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
572 |
-
return False
|
573 |
-
|
574 |
-
return True
|
575 |
-
|
576 |
-
|
577 |
-
@lru_cache(maxsize=2048)
|
578 |
-
def mess_ratio(
|
579 |
-
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
580 |
-
) -> float:
|
581 |
-
"""
|
582 |
-
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
583 |
-
"""
|
584 |
-
|
585 |
-
detectors: list[MessDetectorPlugin] = [
|
586 |
-
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
587 |
-
]
|
588 |
-
|
589 |
-
length: int = len(decoded_sequence) + 1
|
590 |
-
|
591 |
-
mean_mess_ratio: float = 0.0
|
592 |
-
|
593 |
-
if length < 512:
|
594 |
-
intermediary_mean_mess_ratio_calc: int = 32
|
595 |
-
elif length <= 1024:
|
596 |
-
intermediary_mean_mess_ratio_calc = 64
|
597 |
-
else:
|
598 |
-
intermediary_mean_mess_ratio_calc = 128
|
599 |
-
|
600 |
-
for character, index in zip(decoded_sequence + "\n", range(length)):
|
601 |
-
for detector in detectors:
|
602 |
-
if detector.eligible(character):
|
603 |
-
detector.feed(character)
|
604 |
-
|
605 |
-
if (
|
606 |
-
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
|
607 |
-
) or index == length - 1:
|
608 |
-
mean_mess_ratio = sum(dt.ratio for dt in detectors)
|
609 |
-
|
610 |
-
if mean_mess_ratio >= maximum_threshold:
|
611 |
-
break
|
612 |
-
|
613 |
-
if debug:
|
614 |
-
logger = getLogger("charset_normalizer")
|
615 |
-
|
616 |
-
logger.log(
|
617 |
-
TRACE,
|
618 |
-
"Mess-detector extended-analysis start. "
|
619 |
-
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
|
620 |
-
f"maximum_threshold={maximum_threshold}",
|
621 |
-
)
|
622 |
-
|
623 |
-
if len(decoded_sequence) > 16:
|
624 |
-
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
|
625 |
-
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
|
626 |
-
|
627 |
-
for dt in detectors:
|
628 |
-
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
|
629 |
-
|
630 |
-
return round(mean_mess_ratio, 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/models.py
DELETED
@@ -1,360 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
from encodings.aliases import aliases
|
4 |
-
from hashlib import sha256
|
5 |
-
from json import dumps
|
6 |
-
from re import sub
|
7 |
-
from typing import Any, Iterator, List, Tuple
|
8 |
-
|
9 |
-
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
10 |
-
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
11 |
-
|
12 |
-
|
13 |
-
class CharsetMatch:
|
14 |
-
def __init__(
|
15 |
-
self,
|
16 |
-
payload: bytes,
|
17 |
-
guessed_encoding: str,
|
18 |
-
mean_mess_ratio: float,
|
19 |
-
has_sig_or_bom: bool,
|
20 |
-
languages: CoherenceMatches,
|
21 |
-
decoded_payload: str | None = None,
|
22 |
-
preemptive_declaration: str | None = None,
|
23 |
-
):
|
24 |
-
self._payload: bytes = payload
|
25 |
-
|
26 |
-
self._encoding: str = guessed_encoding
|
27 |
-
self._mean_mess_ratio: float = mean_mess_ratio
|
28 |
-
self._languages: CoherenceMatches = languages
|
29 |
-
self._has_sig_or_bom: bool = has_sig_or_bom
|
30 |
-
self._unicode_ranges: list[str] | None = None
|
31 |
-
|
32 |
-
self._leaves: list[CharsetMatch] = []
|
33 |
-
self._mean_coherence_ratio: float = 0.0
|
34 |
-
|
35 |
-
self._output_payload: bytes | None = None
|
36 |
-
self._output_encoding: str | None = None
|
37 |
-
|
38 |
-
self._string: str | None = decoded_payload
|
39 |
-
|
40 |
-
self._preemptive_declaration: str | None = preemptive_declaration
|
41 |
-
|
42 |
-
def __eq__(self, other: object) -> bool:
|
43 |
-
if not isinstance(other, CharsetMatch):
|
44 |
-
if isinstance(other, str):
|
45 |
-
return iana_name(other) == self.encoding
|
46 |
-
return False
|
47 |
-
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
48 |
-
|
49 |
-
def __lt__(self, other: object) -> bool:
|
50 |
-
"""
|
51 |
-
Implemented to make sorted available upon CharsetMatches items.
|
52 |
-
"""
|
53 |
-
if not isinstance(other, CharsetMatch):
|
54 |
-
raise ValueError
|
55 |
-
|
56 |
-
chaos_difference: float = abs(self.chaos - other.chaos)
|
57 |
-
coherence_difference: float = abs(self.coherence - other.coherence)
|
58 |
-
|
59 |
-
# Below 1% difference --> Use Coherence
|
60 |
-
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
61 |
-
return self.coherence > other.coherence
|
62 |
-
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
|
63 |
-
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
|
64 |
-
# preserve RAM usage!
|
65 |
-
if len(self._payload) >= TOO_BIG_SEQUENCE:
|
66 |
-
return self.chaos < other.chaos
|
67 |
-
return self.multi_byte_usage > other.multi_byte_usage
|
68 |
-
|
69 |
-
return self.chaos < other.chaos
|
70 |
-
|
71 |
-
@property
|
72 |
-
def multi_byte_usage(self) -> float:
|
73 |
-
return 1.0 - (len(str(self)) / len(self.raw))
|
74 |
-
|
75 |
-
def __str__(self) -> str:
|
76 |
-
# Lazy Str Loading
|
77 |
-
if self._string is None:
|
78 |
-
self._string = str(self._payload, self._encoding, "strict")
|
79 |
-
return self._string
|
80 |
-
|
81 |
-
def __repr__(self) -> str:
|
82 |
-
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
|
83 |
-
|
84 |
-
def add_submatch(self, other: CharsetMatch) -> None:
|
85 |
-
if not isinstance(other, CharsetMatch) or other == self:
|
86 |
-
raise ValueError(
|
87 |
-
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
88 |
-
other.__class__
|
89 |
-
)
|
90 |
-
)
|
91 |
-
|
92 |
-
other._string = None # Unload RAM usage; dirty trick.
|
93 |
-
self._leaves.append(other)
|
94 |
-
|
95 |
-
@property
|
96 |
-
def encoding(self) -> str:
|
97 |
-
return self._encoding
|
98 |
-
|
99 |
-
@property
|
100 |
-
def encoding_aliases(self) -> list[str]:
|
101 |
-
"""
|
102 |
-
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
103 |
-
"""
|
104 |
-
also_known_as: list[str] = []
|
105 |
-
for u, p in aliases.items():
|
106 |
-
if self.encoding == u:
|
107 |
-
also_known_as.append(p)
|
108 |
-
elif self.encoding == p:
|
109 |
-
also_known_as.append(u)
|
110 |
-
return also_known_as
|
111 |
-
|
112 |
-
@property
|
113 |
-
def bom(self) -> bool:
|
114 |
-
return self._has_sig_or_bom
|
115 |
-
|
116 |
-
@property
|
117 |
-
def byte_order_mark(self) -> bool:
|
118 |
-
return self._has_sig_or_bom
|
119 |
-
|
120 |
-
@property
|
121 |
-
def languages(self) -> list[str]:
|
122 |
-
"""
|
123 |
-
Return the complete list of possible languages found in decoded sequence.
|
124 |
-
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
|
125 |
-
"""
|
126 |
-
return [e[0] for e in self._languages]
|
127 |
-
|
128 |
-
@property
|
129 |
-
def language(self) -> str:
|
130 |
-
"""
|
131 |
-
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
132 |
-
"Unknown".
|
133 |
-
"""
|
134 |
-
if not self._languages:
|
135 |
-
# Trying to infer the language based on the given encoding
|
136 |
-
# Its either English or we should not pronounce ourselves in certain cases.
|
137 |
-
if "ascii" in self.could_be_from_charset:
|
138 |
-
return "English"
|
139 |
-
|
140 |
-
# doing it there to avoid circular import
|
141 |
-
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
142 |
-
|
143 |
-
languages = (
|
144 |
-
mb_encoding_languages(self.encoding)
|
145 |
-
if is_multi_byte_encoding(self.encoding)
|
146 |
-
else encoding_languages(self.encoding)
|
147 |
-
)
|
148 |
-
|
149 |
-
if len(languages) == 0 or "Latin Based" in languages:
|
150 |
-
return "Unknown"
|
151 |
-
|
152 |
-
return languages[0]
|
153 |
-
|
154 |
-
return self._languages[0][0]
|
155 |
-
|
156 |
-
@property
|
157 |
-
def chaos(self) -> float:
|
158 |
-
return self._mean_mess_ratio
|
159 |
-
|
160 |
-
@property
|
161 |
-
def coherence(self) -> float:
|
162 |
-
if not self._languages:
|
163 |
-
return 0.0
|
164 |
-
return self._languages[0][1]
|
165 |
-
|
166 |
-
@property
|
167 |
-
def percent_chaos(self) -> float:
|
168 |
-
return round(self.chaos * 100, ndigits=3)
|
169 |
-
|
170 |
-
@property
|
171 |
-
def percent_coherence(self) -> float:
|
172 |
-
return round(self.coherence * 100, ndigits=3)
|
173 |
-
|
174 |
-
@property
|
175 |
-
def raw(self) -> bytes:
|
176 |
-
"""
|
177 |
-
Original untouched bytes.
|
178 |
-
"""
|
179 |
-
return self._payload
|
180 |
-
|
181 |
-
@property
|
182 |
-
def submatch(self) -> list[CharsetMatch]:
|
183 |
-
return self._leaves
|
184 |
-
|
185 |
-
@property
|
186 |
-
def has_submatch(self) -> bool:
|
187 |
-
return len(self._leaves) > 0
|
188 |
-
|
189 |
-
@property
|
190 |
-
def alphabets(self) -> list[str]:
|
191 |
-
if self._unicode_ranges is not None:
|
192 |
-
return self._unicode_ranges
|
193 |
-
# list detected ranges
|
194 |
-
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
|
195 |
-
# filter and sort
|
196 |
-
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
197 |
-
return self._unicode_ranges
|
198 |
-
|
199 |
-
@property
|
200 |
-
def could_be_from_charset(self) -> list[str]:
|
201 |
-
"""
|
202 |
-
The complete list of encoding that output the exact SAME str result and therefore could be the originating
|
203 |
-
encoding.
|
204 |
-
This list does include the encoding available in property 'encoding'.
|
205 |
-
"""
|
206 |
-
return [self._encoding] + [m.encoding for m in self._leaves]
|
207 |
-
|
208 |
-
def output(self, encoding: str = "utf_8") -> bytes:
|
209 |
-
"""
|
210 |
-
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
211 |
-
Any errors will be simply ignored by the encoder NOT replaced.
|
212 |
-
"""
|
213 |
-
if self._output_encoding is None or self._output_encoding != encoding:
|
214 |
-
self._output_encoding = encoding
|
215 |
-
decoded_string = str(self)
|
216 |
-
if (
|
217 |
-
self._preemptive_declaration is not None
|
218 |
-
and self._preemptive_declaration.lower()
|
219 |
-
not in ["utf-8", "utf8", "utf_8"]
|
220 |
-
):
|
221 |
-
patched_header = sub(
|
222 |
-
RE_POSSIBLE_ENCODING_INDICATION,
|
223 |
-
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
|
224 |
-
m.groups()[0],
|
225 |
-
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
|
226 |
-
),
|
227 |
-
decoded_string[:8192],
|
228 |
-
count=1,
|
229 |
-
)
|
230 |
-
|
231 |
-
decoded_string = patched_header + decoded_string[8192:]
|
232 |
-
|
233 |
-
self._output_payload = decoded_string.encode(encoding, "replace")
|
234 |
-
|
235 |
-
return self._output_payload # type: ignore
|
236 |
-
|
237 |
-
@property
|
238 |
-
def fingerprint(self) -> str:
|
239 |
-
"""
|
240 |
-
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
|
241 |
-
"""
|
242 |
-
return sha256(self.output()).hexdigest()
|
243 |
-
|
244 |
-
|
245 |
-
class CharsetMatches:
|
246 |
-
"""
|
247 |
-
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
248 |
-
Act like a list(iterable) but does not implements all related methods.
|
249 |
-
"""
|
250 |
-
|
251 |
-
def __init__(self, results: list[CharsetMatch] | None = None):
|
252 |
-
self._results: list[CharsetMatch] = sorted(results) if results else []
|
253 |
-
|
254 |
-
def __iter__(self) -> Iterator[CharsetMatch]:
|
255 |
-
yield from self._results
|
256 |
-
|
257 |
-
def __getitem__(self, item: int | str) -> CharsetMatch:
|
258 |
-
"""
|
259 |
-
Retrieve a single item either by its position or encoding name (alias may be used here).
|
260 |
-
Raise KeyError upon invalid index or encoding not present in results.
|
261 |
-
"""
|
262 |
-
if isinstance(item, int):
|
263 |
-
return self._results[item]
|
264 |
-
if isinstance(item, str):
|
265 |
-
item = iana_name(item, False)
|
266 |
-
for result in self._results:
|
267 |
-
if item in result.could_be_from_charset:
|
268 |
-
return result
|
269 |
-
raise KeyError
|
270 |
-
|
271 |
-
def __len__(self) -> int:
|
272 |
-
return len(self._results)
|
273 |
-
|
274 |
-
def __bool__(self) -> bool:
|
275 |
-
return len(self._results) > 0
|
276 |
-
|
277 |
-
def append(self, item: CharsetMatch) -> None:
|
278 |
-
"""
|
279 |
-
Insert a single match. Will be inserted accordingly to preserve sort.
|
280 |
-
Can be inserted as a submatch.
|
281 |
-
"""
|
282 |
-
if not isinstance(item, CharsetMatch):
|
283 |
-
raise ValueError(
|
284 |
-
"Cannot append instance '{}' to CharsetMatches".format(
|
285 |
-
str(item.__class__)
|
286 |
-
)
|
287 |
-
)
|
288 |
-
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
289 |
-
if len(item.raw) < TOO_BIG_SEQUENCE:
|
290 |
-
for match in self._results:
|
291 |
-
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
292 |
-
match.add_submatch(item)
|
293 |
-
return
|
294 |
-
self._results.append(item)
|
295 |
-
self._results = sorted(self._results)
|
296 |
-
|
297 |
-
def best(self) -> CharsetMatch | None:
|
298 |
-
"""
|
299 |
-
Simply return the first match. Strict equivalent to matches[0].
|
300 |
-
"""
|
301 |
-
if not self._results:
|
302 |
-
return None
|
303 |
-
return self._results[0]
|
304 |
-
|
305 |
-
def first(self) -> CharsetMatch | None:
|
306 |
-
"""
|
307 |
-
Redundant method, call the method best(). Kept for BC reasons.
|
308 |
-
"""
|
309 |
-
return self.best()
|
310 |
-
|
311 |
-
|
312 |
-
CoherenceMatch = Tuple[str, float]
|
313 |
-
CoherenceMatches = List[CoherenceMatch]
|
314 |
-
|
315 |
-
|
316 |
-
class CliDetectionResult:
|
317 |
-
def __init__(
|
318 |
-
self,
|
319 |
-
path: str,
|
320 |
-
encoding: str | None,
|
321 |
-
encoding_aliases: list[str],
|
322 |
-
alternative_encodings: list[str],
|
323 |
-
language: str,
|
324 |
-
alphabets: list[str],
|
325 |
-
has_sig_or_bom: bool,
|
326 |
-
chaos: float,
|
327 |
-
coherence: float,
|
328 |
-
unicode_path: str | None,
|
329 |
-
is_preferred: bool,
|
330 |
-
):
|
331 |
-
self.path: str = path
|
332 |
-
self.unicode_path: str | None = unicode_path
|
333 |
-
self.encoding: str | None = encoding
|
334 |
-
self.encoding_aliases: list[str] = encoding_aliases
|
335 |
-
self.alternative_encodings: list[str] = alternative_encodings
|
336 |
-
self.language: str = language
|
337 |
-
self.alphabets: list[str] = alphabets
|
338 |
-
self.has_sig_or_bom: bool = has_sig_or_bom
|
339 |
-
self.chaos: float = chaos
|
340 |
-
self.coherence: float = coherence
|
341 |
-
self.is_preferred: bool = is_preferred
|
342 |
-
|
343 |
-
@property
|
344 |
-
def __dict__(self) -> dict[str, Any]: # type: ignore
|
345 |
-
return {
|
346 |
-
"path": self.path,
|
347 |
-
"encoding": self.encoding,
|
348 |
-
"encoding_aliases": self.encoding_aliases,
|
349 |
-
"alternative_encodings": self.alternative_encodings,
|
350 |
-
"language": self.language,
|
351 |
-
"alphabets": self.alphabets,
|
352 |
-
"has_sig_or_bom": self.has_sig_or_bom,
|
353 |
-
"chaos": self.chaos,
|
354 |
-
"coherence": self.coherence,
|
355 |
-
"unicode_path": self.unicode_path,
|
356 |
-
"is_preferred": self.is_preferred,
|
357 |
-
}
|
358 |
-
|
359 |
-
def to_json(self) -> str:
|
360 |
-
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/py.typed
DELETED
File without changes
|
env/Lib/site-packages/charset_normalizer/utils.py
DELETED
@@ -1,408 +0,0 @@
|
|
1 |
-
from __future__ import annotations
|
2 |
-
|
3 |
-
import importlib
|
4 |
-
import logging
|
5 |
-
import unicodedata
|
6 |
-
from codecs import IncrementalDecoder
|
7 |
-
from encodings.aliases import aliases
|
8 |
-
from functools import lru_cache
|
9 |
-
from re import findall
|
10 |
-
from typing import Generator
|
11 |
-
|
12 |
-
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
13 |
-
MultibyteIncrementalDecoder,
|
14 |
-
)
|
15 |
-
|
16 |
-
from .constant import (
|
17 |
-
ENCODING_MARKS,
|
18 |
-
IANA_SUPPORTED_SIMILAR,
|
19 |
-
RE_POSSIBLE_ENCODING_INDICATION,
|
20 |
-
UNICODE_RANGES_COMBINED,
|
21 |
-
UNICODE_SECONDARY_RANGE_KEYWORD,
|
22 |
-
UTF8_MAXIMAL_ALLOCATION,
|
23 |
-
)
|
24 |
-
|
25 |
-
|
26 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
27 |
-
def is_accentuated(character: str) -> bool:
|
28 |
-
try:
|
29 |
-
description: str = unicodedata.name(character)
|
30 |
-
except ValueError: # Defensive: unicode database outdated?
|
31 |
-
return False
|
32 |
-
return (
|
33 |
-
"WITH GRAVE" in description
|
34 |
-
or "WITH ACUTE" in description
|
35 |
-
or "WITH CEDILLA" in description
|
36 |
-
or "WITH DIAERESIS" in description
|
37 |
-
or "WITH CIRCUMFLEX" in description
|
38 |
-
or "WITH TILDE" in description
|
39 |
-
or "WITH MACRON" in description
|
40 |
-
or "WITH RING ABOVE" in description
|
41 |
-
)
|
42 |
-
|
43 |
-
|
44 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
45 |
-
def remove_accent(character: str) -> str:
|
46 |
-
decomposed: str = unicodedata.decomposition(character)
|
47 |
-
if not decomposed:
|
48 |
-
return character
|
49 |
-
|
50 |
-
codes: list[str] = decomposed.split(" ")
|
51 |
-
|
52 |
-
return chr(int(codes[0], 16))
|
53 |
-
|
54 |
-
|
55 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
56 |
-
def unicode_range(character: str) -> str | None:
|
57 |
-
"""
|
58 |
-
Retrieve the Unicode range official name from a single character.
|
59 |
-
"""
|
60 |
-
character_ord: int = ord(character)
|
61 |
-
|
62 |
-
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
63 |
-
if character_ord in ord_range:
|
64 |
-
return range_name
|
65 |
-
|
66 |
-
return None
|
67 |
-
|
68 |
-
|
69 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
70 |
-
def is_latin(character: str) -> bool:
|
71 |
-
try:
|
72 |
-
description: str = unicodedata.name(character)
|
73 |
-
except ValueError: # Defensive: unicode database outdated?
|
74 |
-
return False
|
75 |
-
return "LATIN" in description
|
76 |
-
|
77 |
-
|
78 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
79 |
-
def is_punctuation(character: str) -> bool:
|
80 |
-
character_category: str = unicodedata.category(character)
|
81 |
-
|
82 |
-
if "P" in character_category:
|
83 |
-
return True
|
84 |
-
|
85 |
-
character_range: str | None = unicode_range(character)
|
86 |
-
|
87 |
-
if character_range is None:
|
88 |
-
return False
|
89 |
-
|
90 |
-
return "Punctuation" in character_range
|
91 |
-
|
92 |
-
|
93 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
94 |
-
def is_symbol(character: str) -> bool:
|
95 |
-
character_category: str = unicodedata.category(character)
|
96 |
-
|
97 |
-
if "S" in character_category or "N" in character_category:
|
98 |
-
return True
|
99 |
-
|
100 |
-
character_range: str | None = unicode_range(character)
|
101 |
-
|
102 |
-
if character_range is None:
|
103 |
-
return False
|
104 |
-
|
105 |
-
return "Forms" in character_range and character_category != "Lo"
|
106 |
-
|
107 |
-
|
108 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
109 |
-
def is_emoticon(character: str) -> bool:
|
110 |
-
character_range: str | None = unicode_range(character)
|
111 |
-
|
112 |
-
if character_range is None:
|
113 |
-
return False
|
114 |
-
|
115 |
-
return "Emoticons" in character_range or "Pictographs" in character_range
|
116 |
-
|
117 |
-
|
118 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
119 |
-
def is_separator(character: str) -> bool:
|
120 |
-
if character.isspace() or character in {"|", "+", "<", ">"}:
|
121 |
-
return True
|
122 |
-
|
123 |
-
character_category: str = unicodedata.category(character)
|
124 |
-
|
125 |
-
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
|
126 |
-
|
127 |
-
|
128 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
129 |
-
def is_case_variable(character: str) -> bool:
|
130 |
-
return character.islower() != character.isupper()
|
131 |
-
|
132 |
-
|
133 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
134 |
-
def is_cjk(character: str) -> bool:
|
135 |
-
try:
|
136 |
-
character_name = unicodedata.name(character)
|
137 |
-
except ValueError: # Defensive: unicode database outdated?
|
138 |
-
return False
|
139 |
-
|
140 |
-
return "CJK" in character_name
|
141 |
-
|
142 |
-
|
143 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
144 |
-
def is_hiragana(character: str) -> bool:
|
145 |
-
try:
|
146 |
-
character_name = unicodedata.name(character)
|
147 |
-
except ValueError: # Defensive: unicode database outdated?
|
148 |
-
return False
|
149 |
-
|
150 |
-
return "HIRAGANA" in character_name
|
151 |
-
|
152 |
-
|
153 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
154 |
-
def is_katakana(character: str) -> bool:
|
155 |
-
try:
|
156 |
-
character_name = unicodedata.name(character)
|
157 |
-
except ValueError: # Defensive: unicode database outdated?
|
158 |
-
return False
|
159 |
-
|
160 |
-
return "KATAKANA" in character_name
|
161 |
-
|
162 |
-
|
163 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
164 |
-
def is_hangul(character: str) -> bool:
|
165 |
-
try:
|
166 |
-
character_name = unicodedata.name(character)
|
167 |
-
except ValueError: # Defensive: unicode database outdated?
|
168 |
-
return False
|
169 |
-
|
170 |
-
return "HANGUL" in character_name
|
171 |
-
|
172 |
-
|
173 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
174 |
-
def is_thai(character: str) -> bool:
|
175 |
-
try:
|
176 |
-
character_name = unicodedata.name(character)
|
177 |
-
except ValueError: # Defensive: unicode database outdated?
|
178 |
-
return False
|
179 |
-
|
180 |
-
return "THAI" in character_name
|
181 |
-
|
182 |
-
|
183 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
184 |
-
def is_arabic(character: str) -> bool:
|
185 |
-
try:
|
186 |
-
character_name = unicodedata.name(character)
|
187 |
-
except ValueError: # Defensive: unicode database outdated?
|
188 |
-
return False
|
189 |
-
|
190 |
-
return "ARABIC" in character_name
|
191 |
-
|
192 |
-
|
193 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
194 |
-
def is_arabic_isolated_form(character: str) -> bool:
|
195 |
-
try:
|
196 |
-
character_name = unicodedata.name(character)
|
197 |
-
except ValueError: # Defensive: unicode database outdated?
|
198 |
-
return False
|
199 |
-
|
200 |
-
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
|
201 |
-
|
202 |
-
|
203 |
-
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
|
204 |
-
def is_unicode_range_secondary(range_name: str) -> bool:
|
205 |
-
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
206 |
-
|
207 |
-
|
208 |
-
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
209 |
-
def is_unprintable(character: str) -> bool:
|
210 |
-
return (
|
211 |
-
character.isspace() is False # includes \n \t \r \v
|
212 |
-
and character.isprintable() is False
|
213 |
-
and character != "\x1a" # Why? Its the ASCII substitute character.
|
214 |
-
and character != "\ufeff" # bug discovered in Python,
|
215 |
-
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
216 |
-
)
|
217 |
-
|
218 |
-
|
219 |
-
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
|
220 |
-
"""
|
221 |
-
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
|
222 |
-
"""
|
223 |
-
if not isinstance(sequence, bytes):
|
224 |
-
raise TypeError
|
225 |
-
|
226 |
-
seq_len: int = len(sequence)
|
227 |
-
|
228 |
-
results: list[str] = findall(
|
229 |
-
RE_POSSIBLE_ENCODING_INDICATION,
|
230 |
-
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
231 |
-
)
|
232 |
-
|
233 |
-
if len(results) == 0:
|
234 |
-
return None
|
235 |
-
|
236 |
-
for specified_encoding in results:
|
237 |
-
specified_encoding = specified_encoding.lower().replace("-", "_")
|
238 |
-
|
239 |
-
encoding_alias: str
|
240 |
-
encoding_iana: str
|
241 |
-
|
242 |
-
for encoding_alias, encoding_iana in aliases.items():
|
243 |
-
if encoding_alias == specified_encoding:
|
244 |
-
return encoding_iana
|
245 |
-
if encoding_iana == specified_encoding:
|
246 |
-
return encoding_iana
|
247 |
-
|
248 |
-
return None
|
249 |
-
|
250 |
-
|
251 |
-
@lru_cache(maxsize=128)
|
252 |
-
def is_multi_byte_encoding(name: str) -> bool:
|
253 |
-
"""
|
254 |
-
Verify is a specific encoding is a multi byte one based on it IANA name
|
255 |
-
"""
|
256 |
-
return name in {
|
257 |
-
"utf_8",
|
258 |
-
"utf_8_sig",
|
259 |
-
"utf_16",
|
260 |
-
"utf_16_be",
|
261 |
-
"utf_16_le",
|
262 |
-
"utf_32",
|
263 |
-
"utf_32_le",
|
264 |
-
"utf_32_be",
|
265 |
-
"utf_7",
|
266 |
-
} or issubclass(
|
267 |
-
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
|
268 |
-
MultibyteIncrementalDecoder,
|
269 |
-
)
|
270 |
-
|
271 |
-
|
272 |
-
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
|
273 |
-
"""
|
274 |
-
Identify and extract SIG/BOM in given sequence.
|
275 |
-
"""
|
276 |
-
|
277 |
-
for iana_encoding in ENCODING_MARKS:
|
278 |
-
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
|
279 |
-
|
280 |
-
if isinstance(marks, bytes):
|
281 |
-
marks = [marks]
|
282 |
-
|
283 |
-
for mark in marks:
|
284 |
-
if sequence.startswith(mark):
|
285 |
-
return iana_encoding, mark
|
286 |
-
|
287 |
-
return None, b""
|
288 |
-
|
289 |
-
|
290 |
-
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
291 |
-
return iana_encoding not in {"utf_16", "utf_32"}
|
292 |
-
|
293 |
-
|
294 |
-
def iana_name(cp_name: str, strict: bool = True) -> str:
|
295 |
-
"""Returns the Python normalized encoding name (Not the IANA official name)."""
|
296 |
-
cp_name = cp_name.lower().replace("-", "_")
|
297 |
-
|
298 |
-
encoding_alias: str
|
299 |
-
encoding_iana: str
|
300 |
-
|
301 |
-
for encoding_alias, encoding_iana in aliases.items():
|
302 |
-
if cp_name in [encoding_alias, encoding_iana]:
|
303 |
-
return encoding_iana
|
304 |
-
|
305 |
-
if strict:
|
306 |
-
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
|
307 |
-
|
308 |
-
return cp_name
|
309 |
-
|
310 |
-
|
311 |
-
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
312 |
-
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
313 |
-
return 0.0
|
314 |
-
|
315 |
-
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
|
316 |
-
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
|
317 |
-
|
318 |
-
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
319 |
-
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
320 |
-
|
321 |
-
character_match_count: int = 0
|
322 |
-
|
323 |
-
for i in range(255):
|
324 |
-
to_be_decoded: bytes = bytes([i])
|
325 |
-
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
326 |
-
character_match_count += 1
|
327 |
-
|
328 |
-
return character_match_count / 254
|
329 |
-
|
330 |
-
|
331 |
-
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
|
332 |
-
"""
|
333 |
-
Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
|
334 |
-
the function cp_similarity.
|
335 |
-
"""
|
336 |
-
return (
|
337 |
-
iana_name_a in IANA_SUPPORTED_SIMILAR
|
338 |
-
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
339 |
-
)
|
340 |
-
|
341 |
-
|
342 |
-
def set_logging_handler(
|
343 |
-
name: str = "charset_normalizer",
|
344 |
-
level: int = logging.INFO,
|
345 |
-
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
|
346 |
-
) -> None:
|
347 |
-
logger = logging.getLogger(name)
|
348 |
-
logger.setLevel(level)
|
349 |
-
|
350 |
-
handler = logging.StreamHandler()
|
351 |
-
handler.setFormatter(logging.Formatter(format_string))
|
352 |
-
logger.addHandler(handler)
|
353 |
-
|
354 |
-
|
355 |
-
def cut_sequence_chunks(
|
356 |
-
sequences: bytes,
|
357 |
-
encoding_iana: str,
|
358 |
-
offsets: range,
|
359 |
-
chunk_size: int,
|
360 |
-
bom_or_sig_available: bool,
|
361 |
-
strip_sig_or_bom: bool,
|
362 |
-
sig_payload: bytes,
|
363 |
-
is_multi_byte_decoder: bool,
|
364 |
-
decoded_payload: str | None = None,
|
365 |
-
) -> Generator[str, None, None]:
|
366 |
-
if decoded_payload and is_multi_byte_decoder is False:
|
367 |
-
for i in offsets:
|
368 |
-
chunk = decoded_payload[i : i + chunk_size]
|
369 |
-
if not chunk:
|
370 |
-
break
|
371 |
-
yield chunk
|
372 |
-
else:
|
373 |
-
for i in offsets:
|
374 |
-
chunk_end = i + chunk_size
|
375 |
-
if chunk_end > len(sequences) + 8:
|
376 |
-
continue
|
377 |
-
|
378 |
-
cut_sequence = sequences[i : i + chunk_size]
|
379 |
-
|
380 |
-
if bom_or_sig_available and strip_sig_or_bom is False:
|
381 |
-
cut_sequence = sig_payload + cut_sequence
|
382 |
-
|
383 |
-
chunk = cut_sequence.decode(
|
384 |
-
encoding_iana,
|
385 |
-
errors="ignore" if is_multi_byte_decoder else "strict",
|
386 |
-
)
|
387 |
-
|
388 |
-
# multi-byte bad cutting detector and adjustment
|
389 |
-
# not the cleanest way to perform that fix but clever enough for now.
|
390 |
-
if is_multi_byte_decoder and i > 0:
|
391 |
-
chunk_partial_size_chk: int = min(chunk_size, 16)
|
392 |
-
|
393 |
-
if (
|
394 |
-
decoded_payload
|
395 |
-
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
396 |
-
):
|
397 |
-
for j in range(i, i - 4, -1):
|
398 |
-
cut_sequence = sequences[j:chunk_end]
|
399 |
-
|
400 |
-
if bom_or_sig_available and strip_sig_or_bom is False:
|
401 |
-
cut_sequence = sig_payload + cut_sequence
|
402 |
-
|
403 |
-
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
404 |
-
|
405 |
-
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
406 |
-
break
|
407 |
-
|
408 |
-
yield chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/charset_normalizer/version.py
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Expose version
|
3 |
-
"""
|
4 |
-
|
5 |
-
from __future__ import annotations
|
6 |
-
|
7 |
-
__version__ = "3.4.1"
|
8 |
-
VERSION = __version__.split(".")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
pip
|
|
|
|
env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA
DELETED
@@ -1,441 +0,0 @@
|
|
1 |
-
Metadata-Version: 2.1
|
2 |
-
Name: colorama
|
3 |
-
Version: 0.4.6
|
4 |
-
Summary: Cross-platform colored terminal text.
|
5 |
-
Project-URL: Homepage, https://github.com/tartley/colorama
|
6 |
-
Author-email: Jonathan Hartley <[email protected]>
|
7 |
-
License-File: LICENSE.txt
|
8 |
-
Keywords: ansi,color,colour,crossplatform,terminal,text,windows,xplatform
|
9 |
-
Classifier: Development Status :: 5 - Production/Stable
|
10 |
-
Classifier: Environment :: Console
|
11 |
-
Classifier: Intended Audience :: Developers
|
12 |
-
Classifier: License :: OSI Approved :: BSD License
|
13 |
-
Classifier: Operating System :: OS Independent
|
14 |
-
Classifier: Programming Language :: Python
|
15 |
-
Classifier: Programming Language :: Python :: 2
|
16 |
-
Classifier: Programming Language :: Python :: 2.7
|
17 |
-
Classifier: Programming Language :: Python :: 3
|
18 |
-
Classifier: Programming Language :: Python :: 3.7
|
19 |
-
Classifier: Programming Language :: Python :: 3.8
|
20 |
-
Classifier: Programming Language :: Python :: 3.9
|
21 |
-
Classifier: Programming Language :: Python :: 3.10
|
22 |
-
Classifier: Programming Language :: Python :: Implementation :: CPython
|
23 |
-
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
24 |
-
Classifier: Topic :: Terminals
|
25 |
-
Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7
|
26 |
-
Description-Content-Type: text/x-rst
|
27 |
-
|
28 |
-
.. image:: https://img.shields.io/pypi/v/colorama.svg
|
29 |
-
:target: https://pypi.org/project/colorama/
|
30 |
-
:alt: Latest Version
|
31 |
-
|
32 |
-
.. image:: https://img.shields.io/pypi/pyversions/colorama.svg
|
33 |
-
:target: https://pypi.org/project/colorama/
|
34 |
-
:alt: Supported Python versions
|
35 |
-
|
36 |
-
.. image:: https://github.com/tartley/colorama/actions/workflows/test.yml/badge.svg
|
37 |
-
:target: https://github.com/tartley/colorama/actions/workflows/test.yml
|
38 |
-
:alt: Build Status
|
39 |
-
|
40 |
-
Colorama
|
41 |
-
========
|
42 |
-
|
43 |
-
Makes ANSI escape character sequences (for producing colored terminal text and
|
44 |
-
cursor positioning) work under MS Windows.
|
45 |
-
|
46 |
-
.. |donate| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif
|
47 |
-
:target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=2MZ9D2GMLYCUJ&item_name=Colorama¤cy_code=USD
|
48 |
-
:alt: Donate with Paypal
|
49 |
-
|
50 |
-
`PyPI for releases <https://pypi.org/project/colorama/>`_ |
|
51 |
-
`Github for source <https://github.com/tartley/colorama>`_ |
|
52 |
-
`Colorama for enterprise on Tidelift <https://github.com/tartley/colorama/blob/master/ENTERPRISE.md>`_
|
53 |
-
|
54 |
-
If you find Colorama useful, please |donate| to the authors. Thank you!
|
55 |
-
|
56 |
-
Installation
|
57 |
-
------------
|
58 |
-
|
59 |
-
Tested on CPython 2.7, 3.7, 3.8, 3.9 and 3.10 and Pypy 2.7 and 3.8.
|
60 |
-
|
61 |
-
No requirements other than the standard library.
|
62 |
-
|
63 |
-
.. code-block:: bash
|
64 |
-
|
65 |
-
pip install colorama
|
66 |
-
# or
|
67 |
-
conda install -c anaconda colorama
|
68 |
-
|
69 |
-
Description
|
70 |
-
-----------
|
71 |
-
|
72 |
-
ANSI escape character sequences have long been used to produce colored terminal
|
73 |
-
text and cursor positioning on Unix and Macs. Colorama makes this work on
|
74 |
-
Windows, too, by wrapping ``stdout``, stripping ANSI sequences it finds (which
|
75 |
-
would appear as gobbledygook in the output), and converting them into the
|
76 |
-
appropriate win32 calls to modify the state of the terminal. On other platforms,
|
77 |
-
Colorama does nothing.
|
78 |
-
|
79 |
-
This has the upshot of providing a simple cross-platform API for printing
|
80 |
-
colored terminal text from Python, and has the happy side-effect that existing
|
81 |
-
applications or libraries which use ANSI sequences to produce colored output on
|
82 |
-
Linux or Macs can now also work on Windows, simply by calling
|
83 |
-
``colorama.just_fix_windows_console()`` (since v0.4.6) or ``colorama.init()``
|
84 |
-
(all versions, but may have other side-effects – see below).
|
85 |
-
|
86 |
-
An alternative approach is to install ``ansi.sys`` on Windows machines, which
|
87 |
-
provides the same behaviour for all applications running in terminals. Colorama
|
88 |
-
is intended for situations where that isn't easy (e.g., maybe your app doesn't
|
89 |
-
have an installer.)
|
90 |
-
|
91 |
-
Demo scripts in the source code repository print some colored text using
|
92 |
-
ANSI sequences. Compare their output under Gnome-terminal's built in ANSI
|
93 |
-
handling, versus on Windows Command-Prompt using Colorama:
|
94 |
-
|
95 |
-
.. image:: https://github.com/tartley/colorama/raw/master/screenshots/ubuntu-demo.png
|
96 |
-
:width: 661
|
97 |
-
:height: 357
|
98 |
-
:alt: ANSI sequences on Ubuntu under gnome-terminal.
|
99 |
-
|
100 |
-
.. image:: https://github.com/tartley/colorama/raw/master/screenshots/windows-demo.png
|
101 |
-
:width: 668
|
102 |
-
:height: 325
|
103 |
-
:alt: Same ANSI sequences on Windows, using Colorama.
|
104 |
-
|
105 |
-
These screenshots show that, on Windows, Colorama does not support ANSI 'dim
|
106 |
-
text'; it looks the same as 'normal text'.
|
107 |
-
|
108 |
-
Usage
|
109 |
-
-----
|
110 |
-
|
111 |
-
Initialisation
|
112 |
-
..............
|
113 |
-
|
114 |
-
If the only thing you want from Colorama is to get ANSI escapes to work on
|
115 |
-
Windows, then run:
|
116 |
-
|
117 |
-
.. code-block:: python
|
118 |
-
|
119 |
-
from colorama import just_fix_windows_console
|
120 |
-
just_fix_windows_console()
|
121 |
-
|
122 |
-
If you're on a recent version of Windows 10 or better, and your stdout/stderr
|
123 |
-
are pointing to a Windows console, then this will flip the magic configuration
|
124 |
-
switch to enable Windows' built-in ANSI support.
|
125 |
-
|
126 |
-
If you're on an older version of Windows, and your stdout/stderr are pointing to
|
127 |
-
a Windows console, then this will wrap ``sys.stdout`` and/or ``sys.stderr`` in a
|
128 |
-
magic file object that intercepts ANSI escape sequences and issues the
|
129 |
-
appropriate Win32 calls to emulate them.
|
130 |
-
|
131 |
-
In all other circumstances, it does nothing whatsoever. Basically the idea is
|
132 |
-
that this makes Windows act like Unix with respect to ANSI escape handling.
|
133 |
-
|
134 |
-
It's safe to call this function multiple times. It's safe to call this function
|
135 |
-
on non-Windows platforms, but it won't do anything. It's safe to call this
|
136 |
-
function when one or both of your stdout/stderr are redirected to a file – it
|
137 |
-
won't do anything to those streams.
|
138 |
-
|
139 |
-
Alternatively, you can use the older interface with more features (but also more
|
140 |
-
potential footguns):
|
141 |
-
|
142 |
-
.. code-block:: python
|
143 |
-
|
144 |
-
from colorama import init
|
145 |
-
init()
|
146 |
-
|
147 |
-
This does the same thing as ``just_fix_windows_console``, except for the
|
148 |
-
following differences:
|
149 |
-
|
150 |
-
- It's not safe to call ``init`` multiple times; you can end up with multiple
|
151 |
-
layers of wrapping and broken ANSI support.
|
152 |
-
|
153 |
-
- Colorama will apply a heuristic to guess whether stdout/stderr support ANSI,
|
154 |
-
and if it thinks they don't, then it will wrap ``sys.stdout`` and
|
155 |
-
``sys.stderr`` in a magic file object that strips out ANSI escape sequences
|
156 |
-
before printing them. This happens on all platforms, and can be convenient if
|
157 |
-
you want to write your code to emit ANSI escape sequences unconditionally, and
|
158 |
-
let Colorama decide whether they should actually be output. But note that
|
159 |
-
Colorama's heuristic is not particularly clever.
|
160 |
-
|
161 |
-
- ``init`` also accepts explicit keyword args to enable/disable various
|
162 |
-
functionality – see below.
|
163 |
-
|
164 |
-
To stop using Colorama before your program exits, simply call ``deinit()``.
|
165 |
-
This will restore ``stdout`` and ``stderr`` to their original values, so that
|
166 |
-
Colorama is disabled. To resume using Colorama again, call ``reinit()``; it is
|
167 |
-
cheaper than calling ``init()`` again (but does the same thing).
|
168 |
-
|
169 |
-
Most users should depend on ``colorama >= 0.4.6``, and use
|
170 |
-
``just_fix_windows_console``. The old ``init`` interface will be supported
|
171 |
-
indefinitely for backwards compatibility, but we don't plan to fix any issues
|
172 |
-
with it, also for backwards compatibility.
|
173 |
-
|
174 |
-
Colored Output
|
175 |
-
..............
|
176 |
-
|
177 |
-
Cross-platform printing of colored text can then be done using Colorama's
|
178 |
-
constant shorthand for ANSI escape sequences. These are deliberately
|
179 |
-
rudimentary, see below.
|
180 |
-
|
181 |
-
.. code-block:: python
|
182 |
-
|
183 |
-
from colorama import Fore, Back, Style
|
184 |
-
print(Fore.RED + 'some red text')
|
185 |
-
print(Back.GREEN + 'and with a green background')
|
186 |
-
print(Style.DIM + 'and in dim text')
|
187 |
-
print(Style.RESET_ALL)
|
188 |
-
print('back to normal now')
|
189 |
-
|
190 |
-
...or simply by manually printing ANSI sequences from your own code:
|
191 |
-
|
192 |
-
.. code-block:: python
|
193 |
-
|
194 |
-
print('\033[31m' + 'some red text')
|
195 |
-
print('\033[39m') # and reset to default color
|
196 |
-
|
197 |
-
...or, Colorama can be used in conjunction with existing ANSI libraries
|
198 |
-
such as the venerable `Termcolor <https://pypi.org/project/termcolor/>`_
|
199 |
-
the fabulous `Blessings <https://pypi.org/project/blessings/>`_,
|
200 |
-
or the incredible `_Rich <https://pypi.org/project/rich/>`_.
|
201 |
-
|
202 |
-
If you wish Colorama's Fore, Back and Style constants were more capable,
|
203 |
-
then consider using one of the above highly capable libraries to generate
|
204 |
-
colors, etc, and use Colorama just for its primary purpose: to convert
|
205 |
-
those ANSI sequences to also work on Windows:
|
206 |
-
|
207 |
-
SIMILARLY, do not send PRs adding the generation of new ANSI types to Colorama.
|
208 |
-
We are only interested in converting ANSI codes to win32 API calls, not
|
209 |
-
shortcuts like the above to generate ANSI characters.
|
210 |
-
|
211 |
-
.. code-block:: python
|
212 |
-
|
213 |
-
from colorama import just_fix_windows_console
|
214 |
-
from termcolor import colored
|
215 |
-
|
216 |
-
# use Colorama to make Termcolor work on Windows too
|
217 |
-
just_fix_windows_console()
|
218 |
-
|
219 |
-
# then use Termcolor for all colored text output
|
220 |
-
print(colored('Hello, World!', 'green', 'on_red'))
|
221 |
-
|
222 |
-
Available formatting constants are::
|
223 |
-
|
224 |
-
Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
|
225 |
-
Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
|
226 |
-
Style: DIM, NORMAL, BRIGHT, RESET_ALL
|
227 |
-
|
228 |
-
``Style.RESET_ALL`` resets foreground, background, and brightness. Colorama will
|
229 |
-
perform this reset automatically on program exit.
|
230 |
-
|
231 |
-
These are fairly well supported, but not part of the standard::
|
232 |
-
|
233 |
-
Fore: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
|
234 |
-
Back: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
|
235 |
-
|
236 |
-
Cursor Positioning
|
237 |
-
..................
|
238 |
-
|
239 |
-
ANSI codes to reposition the cursor are supported. See ``demos/demo06.py`` for
|
240 |
-
an example of how to generate them.
|
241 |
-
|
242 |
-
Init Keyword Args
|
243 |
-
.................
|
244 |
-
|
245 |
-
``init()`` accepts some ``**kwargs`` to override default behaviour.
|
246 |
-
|
247 |
-
init(autoreset=False):
|
248 |
-
If you find yourself repeatedly sending reset sequences to turn off color
|
249 |
-
changes at the end of every print, then ``init(autoreset=True)`` will
|
250 |
-
automate that:
|
251 |
-
|
252 |
-
.. code-block:: python
|
253 |
-
|
254 |
-
from colorama import init
|
255 |
-
init(autoreset=True)
|
256 |
-
print(Fore.RED + 'some red text')
|
257 |
-
print('automatically back to default color again')
|
258 |
-
|
259 |
-
init(strip=None):
|
260 |
-
Pass ``True`` or ``False`` to override whether ANSI codes should be
|
261 |
-
stripped from the output. The default behaviour is to strip if on Windows
|
262 |
-
or if output is redirected (not a tty).
|
263 |
-
|
264 |
-
init(convert=None):
|
265 |
-
Pass ``True`` or ``False`` to override whether to convert ANSI codes in the
|
266 |
-
output into win32 calls. The default behaviour is to convert if on Windows
|
267 |
-
and output is to a tty (terminal).
|
268 |
-
|
269 |
-
init(wrap=True):
|
270 |
-
On Windows, Colorama works by replacing ``sys.stdout`` and ``sys.stderr``
|
271 |
-
with proxy objects, which override the ``.write()`` method to do their work.
|
272 |
-
If this wrapping causes you problems, then this can be disabled by passing
|
273 |
-
``init(wrap=False)``. The default behaviour is to wrap if ``autoreset`` or
|
274 |
-
``strip`` or ``convert`` are True.
|
275 |
-
|
276 |
-
When wrapping is disabled, colored printing on non-Windows platforms will
|
277 |
-
continue to work as normal. To do cross-platform colored output, you can
|
278 |
-
use Colorama's ``AnsiToWin32`` proxy directly:
|
279 |
-
|
280 |
-
.. code-block:: python
|
281 |
-
|
282 |
-
import sys
|
283 |
-
from colorama import init, AnsiToWin32
|
284 |
-
init(wrap=False)
|
285 |
-
stream = AnsiToWin32(sys.stderr).stream
|
286 |
-
|
287 |
-
# Python 2
|
288 |
-
print >>stream, Fore.BLUE + 'blue text on stderr'
|
289 |
-
|
290 |
-
# Python 3
|
291 |
-
print(Fore.BLUE + 'blue text on stderr', file=stream)
|
292 |
-
|
293 |
-
Recognised ANSI Sequences
|
294 |
-
.........................
|
295 |
-
|
296 |
-
ANSI sequences generally take the form::
|
297 |
-
|
298 |
-
ESC [ <param> ; <param> ... <command>
|
299 |
-
|
300 |
-
Where ``<param>`` is an integer, and ``<command>`` is a single letter. Zero or
|
301 |
-
more params are passed to a ``<command>``. If no params are passed, it is
|
302 |
-
generally synonymous with passing a single zero. No spaces exist in the
|
303 |
-
sequence; they have been inserted here simply to read more easily.
|
304 |
-
|
305 |
-
The only ANSI sequences that Colorama converts into win32 calls are::
|
306 |
-
|
307 |
-
ESC [ 0 m # reset all (colors and brightness)
|
308 |
-
ESC [ 1 m # bright
|
309 |
-
ESC [ 2 m # dim (looks same as normal brightness)
|
310 |
-
ESC [ 22 m # normal brightness
|
311 |
-
|
312 |
-
# FOREGROUND:
|
313 |
-
ESC [ 30 m # black
|
314 |
-
ESC [ 31 m # red
|
315 |
-
ESC [ 32 m # green
|
316 |
-
ESC [ 33 m # yellow
|
317 |
-
ESC [ 34 m # blue
|
318 |
-
ESC [ 35 m # magenta
|
319 |
-
ESC [ 36 m # cyan
|
320 |
-
ESC [ 37 m # white
|
321 |
-
ESC [ 39 m # reset
|
322 |
-
|
323 |
-
# BACKGROUND
|
324 |
-
ESC [ 40 m # black
|
325 |
-
ESC [ 41 m # red
|
326 |
-
ESC [ 42 m # green
|
327 |
-
ESC [ 43 m # yellow
|
328 |
-
ESC [ 44 m # blue
|
329 |
-
ESC [ 45 m # magenta
|
330 |
-
ESC [ 46 m # cyan
|
331 |
-
ESC [ 47 m # white
|
332 |
-
ESC [ 49 m # reset
|
333 |
-
|
334 |
-
# cursor positioning
|
335 |
-
ESC [ y;x H # position cursor at x across, y down
|
336 |
-
ESC [ y;x f # position cursor at x across, y down
|
337 |
-
ESC [ n A # move cursor n lines up
|
338 |
-
ESC [ n B # move cursor n lines down
|
339 |
-
ESC [ n C # move cursor n characters forward
|
340 |
-
ESC [ n D # move cursor n characters backward
|
341 |
-
|
342 |
-
# clear the screen
|
343 |
-
ESC [ mode J # clear the screen
|
344 |
-
|
345 |
-
# clear the line
|
346 |
-
ESC [ mode K # clear the line
|
347 |
-
|
348 |
-
Multiple numeric params to the ``'m'`` command can be combined into a single
|
349 |
-
sequence::
|
350 |
-
|
351 |
-
ESC [ 36 ; 45 ; 1 m # bright cyan text on magenta background
|
352 |
-
|
353 |
-
All other ANSI sequences of the form ``ESC [ <param> ; <param> ... <command>``
|
354 |
-
are silently stripped from the output on Windows.
|
355 |
-
|
356 |
-
Any other form of ANSI sequence, such as single-character codes or alternative
|
357 |
-
initial characters, are not recognised or stripped. It would be cool to add
|
358 |
-
them though. Let me know if it would be useful for you, via the Issues on
|
359 |
-
GitHub.
|
360 |
-
|
361 |
-
Status & Known Problems
|
362 |
-
-----------------------
|
363 |
-
|
364 |
-
I've personally only tested it on Windows XP (CMD, Console2), Ubuntu
|
365 |
-
(gnome-terminal, xterm), and OS X.
|
366 |
-
|
367 |
-
Some valid ANSI sequences aren't recognised.
|
368 |
-
|
369 |
-
If you're hacking on the code, see `README-hacking.md`_. ESPECIALLY, see the
|
370 |
-
explanation there of why we do not want PRs that allow Colorama to generate new
|
371 |
-
types of ANSI codes.
|
372 |
-
|
373 |
-
See outstanding issues and wish-list:
|
374 |
-
https://github.com/tartley/colorama/issues
|
375 |
-
|
376 |
-
If anything doesn't work for you, or doesn't do what you expected or hoped for,
|
377 |
-
I'd love to hear about it on that issues list, would be delighted by patches,
|
378 |
-
and would be happy to grant commit access to anyone who submits a working patch
|
379 |
-
or two.
|
380 |
-
|
381 |
-
.. _README-hacking.md: README-hacking.md
|
382 |
-
|
383 |
-
License
|
384 |
-
-------
|
385 |
-
|
386 |
-
Copyright Jonathan Hartley & Arnon Yaari, 2013-2020. BSD 3-Clause license; see
|
387 |
-
LICENSE file.
|
388 |
-
|
389 |
-
Professional support
|
390 |
-
--------------------
|
391 |
-
|
392 |
-
.. |tideliftlogo| image:: https://cdn2.hubspot.net/hubfs/4008838/website/logos/logos_for_download/Tidelift_primary-shorthand-logo.png
|
393 |
-
:alt: Tidelift
|
394 |
-
:target: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
|
395 |
-
|
396 |
-
.. list-table::
|
397 |
-
:widths: 10 100
|
398 |
-
|
399 |
-
* - |tideliftlogo|
|
400 |
-
- Professional support for colorama is available as part of the
|
401 |
-
`Tidelift Subscription`_.
|
402 |
-
Tidelift gives software development teams a single source for purchasing
|
403 |
-
and maintaining their software, with professional grade assurances from
|
404 |
-
the experts who know it best, while seamlessly integrating with existing
|
405 |
-
tools.
|
406 |
-
|
407 |
-
.. _Tidelift Subscription: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
|
408 |
-
|
409 |
-
Thanks
|
410 |
-
------
|
411 |
-
|
412 |
-
See the CHANGELOG for more thanks!
|
413 |
-
|
414 |
-
* Marc Schlaich (schlamar) for a ``setup.py`` fix for Python2.5.
|
415 |
-
* Marc Abramowitz, reported & fixed a crash on exit with closed ``stdout``,
|
416 |
-
providing a solution to issue #7's setuptools/distutils debate,
|
417 |
-
and other fixes.
|
418 |
-
* User 'eryksun', for guidance on correctly instantiating ``ctypes.windll``.
|
419 |
-
* Matthew McCormick for politely pointing out a longstanding crash on non-Win.
|
420 |
-
* Ben Hoyt, for a magnificent fix under 64-bit Windows.
|
421 |
-
* Jesse at Empty Square for submitting a fix for examples in the README.
|
422 |
-
* User 'jamessp', an observant documentation fix for cursor positioning.
|
423 |
-
* User 'vaal1239', Dave Mckee & Lackner Kristof for a tiny but much-needed Win7
|
424 |
-
fix.
|
425 |
-
* Julien Stuyck, for wisely suggesting Python3 compatible updates to README.
|
426 |
-
* Daniel Griffith for multiple fabulous patches.
|
427 |
-
* Oscar Lesta for a valuable fix to stop ANSI chars being sent to non-tty
|
428 |
-
output.
|
429 |
-
* Roger Binns, for many suggestions, valuable feedback, & bug reports.
|
430 |
-
* Tim Golden for thought and much appreciated feedback on the initial idea.
|
431 |
-
* User 'Zearin' for updates to the README file.
|
432 |
-
* John Szakmeister for adding support for light colors
|
433 |
-
* Charles Merriam for adding documentation to demos
|
434 |
-
* Jurko for a fix on 64-bit Windows CPython2.5 w/o ctypes
|
435 |
-
* Florian Bruhin for a fix when stdout or stderr are None
|
436 |
-
* Thomas Weininger for fixing ValueError on Windows
|
437 |
-
* Remi Rampin for better Github integration and fixes to the README file
|
438 |
-
* Simeon Visser for closing a file handle using 'with' and updating classifiers
|
439 |
-
to include Python 3.3 and 3.4
|
440 |
-
* Andy Neff for fixing RESET of LIGHT_EX colors.
|
441 |
-
* Jonathan Hartley for the initial idea and implementation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
colorama-0.4.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
2 |
-
colorama-0.4.6.dist-info/METADATA,sha256=e67SnrUMOym9sz_4TjF3vxvAV4T3aF7NyqRHHH3YEMw,17158
|
3 |
-
colorama-0.4.6.dist-info/RECORD,,
|
4 |
-
colorama-0.4.6.dist-info/WHEEL,sha256=cdcF4Fbd0FPtw2EMIOwH-3rSOTUdTCeOSXRMD1iLUb8,105
|
5 |
-
colorama-0.4.6.dist-info/licenses/LICENSE.txt,sha256=ysNcAmhuXQSlpxQL-zs25zrtSWZW6JEQLkKIhteTAxg,1491
|
6 |
-
colorama/__init__.py,sha256=wePQA4U20tKgYARySLEC047ucNX-g8pRLpYBuiHlLb8,266
|
7 |
-
colorama/__pycache__/__init__.cpython-312.pyc,,
|
8 |
-
colorama/__pycache__/ansi.cpython-312.pyc,,
|
9 |
-
colorama/__pycache__/ansitowin32.cpython-312.pyc,,
|
10 |
-
colorama/__pycache__/initialise.cpython-312.pyc,,
|
11 |
-
colorama/__pycache__/win32.cpython-312.pyc,,
|
12 |
-
colorama/__pycache__/winterm.cpython-312.pyc,,
|
13 |
-
colorama/ansi.py,sha256=Top4EeEuaQdBWdteKMEcGOTeKeF19Q-Wo_6_Cj5kOzQ,2522
|
14 |
-
colorama/ansitowin32.py,sha256=vPNYa3OZbxjbuFyaVo0Tmhmy1FZ1lKMWCnT7odXpItk,11128
|
15 |
-
colorama/initialise.py,sha256=-hIny86ClXo39ixh5iSCfUIa2f_h_bgKRDW7gqs-KLU,3325
|
16 |
-
colorama/tests/__init__.py,sha256=MkgPAEzGQd-Rq0w0PZXSX2LadRWhUECcisJY8lSrm4Q,75
|
17 |
-
colorama/tests/__pycache__/__init__.cpython-312.pyc,,
|
18 |
-
colorama/tests/__pycache__/ansi_test.cpython-312.pyc,,
|
19 |
-
colorama/tests/__pycache__/ansitowin32_test.cpython-312.pyc,,
|
20 |
-
colorama/tests/__pycache__/initialise_test.cpython-312.pyc,,
|
21 |
-
colorama/tests/__pycache__/isatty_test.cpython-312.pyc,,
|
22 |
-
colorama/tests/__pycache__/utils.cpython-312.pyc,,
|
23 |
-
colorama/tests/__pycache__/winterm_test.cpython-312.pyc,,
|
24 |
-
colorama/tests/ansi_test.py,sha256=FeViDrUINIZcr505PAxvU4AjXz1asEiALs9GXMhwRaE,2839
|
25 |
-
colorama/tests/ansitowin32_test.py,sha256=RN7AIhMJ5EqDsYaCjVo-o4u8JzDD4ukJbmevWKS70rY,10678
|
26 |
-
colorama/tests/initialise_test.py,sha256=BbPy-XfyHwJ6zKozuQOvNvQZzsx9vdb_0bYXn7hsBTc,6741
|
27 |
-
colorama/tests/isatty_test.py,sha256=Pg26LRpv0yQDB5Ac-sxgVXG7hsA1NYvapFgApZfYzZg,1866
|
28 |
-
colorama/tests/utils.py,sha256=1IIRylG39z5-dzq09R_ngufxyPZxgldNbrxKxUGwGKE,1079
|
29 |
-
colorama/tests/winterm_test.py,sha256=qoWFPEjym5gm2RuMwpf3pOis3a5r_PJZFCzK254JL8A,3709
|
30 |
-
colorama/win32.py,sha256=YQOKwMTwtGBbsY4dL5HYTvwTeP9wIQra5MvPNddpxZs,6181
|
31 |
-
colorama/winterm.py,sha256=XCQFDHjPi6AHYNdZwy0tA02H-Jh48Jp-HvCjeLeLp3U,7134
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
Wheel-Version: 1.0
|
2 |
-
Generator: hatchling 1.11.1
|
3 |
-
Root-Is-Purelib: true
|
4 |
-
Tag: py2-none-any
|
5 |
-
Tag: py3-none-any
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama-0.4.6.dist-info/licenses/LICENSE.txt
DELETED
@@ -1,27 +0,0 @@
|
|
1 |
-
Copyright (c) 2010 Jonathan Hartley
|
2 |
-
All rights reserved.
|
3 |
-
|
4 |
-
Redistribution and use in source and binary forms, with or without
|
5 |
-
modification, are permitted provided that the following conditions are met:
|
6 |
-
|
7 |
-
* Redistributions of source code must retain the above copyright notice, this
|
8 |
-
list of conditions and the following disclaimer.
|
9 |
-
|
10 |
-
* Redistributions in binary form must reproduce the above copyright notice,
|
11 |
-
this list of conditions and the following disclaimer in the documentation
|
12 |
-
and/or other materials provided with the distribution.
|
13 |
-
|
14 |
-
* Neither the name of the copyright holders, nor those of its contributors
|
15 |
-
may be used to endorse or promote products derived from this software without
|
16 |
-
specific prior written permission.
|
17 |
-
|
18 |
-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
19 |
-
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
20 |
-
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
21 |
-
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
22 |
-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
23 |
-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
24 |
-
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
25 |
-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
26 |
-
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
27 |
-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama/__init__.py
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
2 |
-
from .initialise import init, deinit, reinit, colorama_text, just_fix_windows_console
|
3 |
-
from .ansi import Fore, Back, Style, Cursor
|
4 |
-
from .ansitowin32 import AnsiToWin32
|
5 |
-
|
6 |
-
__version__ = '0.4.6'
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama/ansi.py
DELETED
@@ -1,102 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
2 |
-
'''
|
3 |
-
This module generates ANSI character codes to printing colors to terminals.
|
4 |
-
See: http://en.wikipedia.org/wiki/ANSI_escape_code
|
5 |
-
'''
|
6 |
-
|
7 |
-
CSI = '\033['
|
8 |
-
OSC = '\033]'
|
9 |
-
BEL = '\a'
|
10 |
-
|
11 |
-
|
12 |
-
def code_to_chars(code):
|
13 |
-
return CSI + str(code) + 'm'
|
14 |
-
|
15 |
-
def set_title(title):
|
16 |
-
return OSC + '2;' + title + BEL
|
17 |
-
|
18 |
-
def clear_screen(mode=2):
|
19 |
-
return CSI + str(mode) + 'J'
|
20 |
-
|
21 |
-
def clear_line(mode=2):
|
22 |
-
return CSI + str(mode) + 'K'
|
23 |
-
|
24 |
-
|
25 |
-
class AnsiCodes(object):
|
26 |
-
def __init__(self):
|
27 |
-
# the subclasses declare class attributes which are numbers.
|
28 |
-
# Upon instantiation we define instance attributes, which are the same
|
29 |
-
# as the class attributes but wrapped with the ANSI escape sequence
|
30 |
-
for name in dir(self):
|
31 |
-
if not name.startswith('_'):
|
32 |
-
value = getattr(self, name)
|
33 |
-
setattr(self, name, code_to_chars(value))
|
34 |
-
|
35 |
-
|
36 |
-
class AnsiCursor(object):
|
37 |
-
def UP(self, n=1):
|
38 |
-
return CSI + str(n) + 'A'
|
39 |
-
def DOWN(self, n=1):
|
40 |
-
return CSI + str(n) + 'B'
|
41 |
-
def FORWARD(self, n=1):
|
42 |
-
return CSI + str(n) + 'C'
|
43 |
-
def BACK(self, n=1):
|
44 |
-
return CSI + str(n) + 'D'
|
45 |
-
def POS(self, x=1, y=1):
|
46 |
-
return CSI + str(y) + ';' + str(x) + 'H'
|
47 |
-
|
48 |
-
|
49 |
-
class AnsiFore(AnsiCodes):
|
50 |
-
BLACK = 30
|
51 |
-
RED = 31
|
52 |
-
GREEN = 32
|
53 |
-
YELLOW = 33
|
54 |
-
BLUE = 34
|
55 |
-
MAGENTA = 35
|
56 |
-
CYAN = 36
|
57 |
-
WHITE = 37
|
58 |
-
RESET = 39
|
59 |
-
|
60 |
-
# These are fairly well supported, but not part of the standard.
|
61 |
-
LIGHTBLACK_EX = 90
|
62 |
-
LIGHTRED_EX = 91
|
63 |
-
LIGHTGREEN_EX = 92
|
64 |
-
LIGHTYELLOW_EX = 93
|
65 |
-
LIGHTBLUE_EX = 94
|
66 |
-
LIGHTMAGENTA_EX = 95
|
67 |
-
LIGHTCYAN_EX = 96
|
68 |
-
LIGHTWHITE_EX = 97
|
69 |
-
|
70 |
-
|
71 |
-
class AnsiBack(AnsiCodes):
|
72 |
-
BLACK = 40
|
73 |
-
RED = 41
|
74 |
-
GREEN = 42
|
75 |
-
YELLOW = 43
|
76 |
-
BLUE = 44
|
77 |
-
MAGENTA = 45
|
78 |
-
CYAN = 46
|
79 |
-
WHITE = 47
|
80 |
-
RESET = 49
|
81 |
-
|
82 |
-
# These are fairly well supported, but not part of the standard.
|
83 |
-
LIGHTBLACK_EX = 100
|
84 |
-
LIGHTRED_EX = 101
|
85 |
-
LIGHTGREEN_EX = 102
|
86 |
-
LIGHTYELLOW_EX = 103
|
87 |
-
LIGHTBLUE_EX = 104
|
88 |
-
LIGHTMAGENTA_EX = 105
|
89 |
-
LIGHTCYAN_EX = 106
|
90 |
-
LIGHTWHITE_EX = 107
|
91 |
-
|
92 |
-
|
93 |
-
class AnsiStyle(AnsiCodes):
|
94 |
-
BRIGHT = 1
|
95 |
-
DIM = 2
|
96 |
-
NORMAL = 22
|
97 |
-
RESET_ALL = 0
|
98 |
-
|
99 |
-
Fore = AnsiFore()
|
100 |
-
Back = AnsiBack()
|
101 |
-
Style = AnsiStyle()
|
102 |
-
Cursor = AnsiCursor()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama/ansitowin32.py
DELETED
@@ -1,277 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
2 |
-
import re
|
3 |
-
import sys
|
4 |
-
import os
|
5 |
-
|
6 |
-
from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
|
7 |
-
from .winterm import enable_vt_processing, WinTerm, WinColor, WinStyle
|
8 |
-
from .win32 import windll, winapi_test
|
9 |
-
|
10 |
-
|
11 |
-
winterm = None
|
12 |
-
if windll is not None:
|
13 |
-
winterm = WinTerm()
|
14 |
-
|
15 |
-
|
16 |
-
class StreamWrapper(object):
|
17 |
-
'''
|
18 |
-
Wraps a stream (such as stdout), acting as a transparent proxy for all
|
19 |
-
attribute access apart from method 'write()', which is delegated to our
|
20 |
-
Converter instance.
|
21 |
-
'''
|
22 |
-
def __init__(self, wrapped, converter):
|
23 |
-
# double-underscore everything to prevent clashes with names of
|
24 |
-
# attributes on the wrapped stream object.
|
25 |
-
self.__wrapped = wrapped
|
26 |
-
self.__convertor = converter
|
27 |
-
|
28 |
-
def __getattr__(self, name):
|
29 |
-
return getattr(self.__wrapped, name)
|
30 |
-
|
31 |
-
def __enter__(self, *args, **kwargs):
|
32 |
-
# special method lookup bypasses __getattr__/__getattribute__, see
|
33 |
-
# https://stackoverflow.com/questions/12632894/why-doesnt-getattr-work-with-exit
|
34 |
-
# thus, contextlib magic methods are not proxied via __getattr__
|
35 |
-
return self.__wrapped.__enter__(*args, **kwargs)
|
36 |
-
|
37 |
-
def __exit__(self, *args, **kwargs):
|
38 |
-
return self.__wrapped.__exit__(*args, **kwargs)
|
39 |
-
|
40 |
-
def __setstate__(self, state):
|
41 |
-
self.__dict__ = state
|
42 |
-
|
43 |
-
def __getstate__(self):
|
44 |
-
return self.__dict__
|
45 |
-
|
46 |
-
def write(self, text):
|
47 |
-
self.__convertor.write(text)
|
48 |
-
|
49 |
-
def isatty(self):
|
50 |
-
stream = self.__wrapped
|
51 |
-
if 'PYCHARM_HOSTED' in os.environ:
|
52 |
-
if stream is not None and (stream is sys.__stdout__ or stream is sys.__stderr__):
|
53 |
-
return True
|
54 |
-
try:
|
55 |
-
stream_isatty = stream.isatty
|
56 |
-
except AttributeError:
|
57 |
-
return False
|
58 |
-
else:
|
59 |
-
return stream_isatty()
|
60 |
-
|
61 |
-
@property
|
62 |
-
def closed(self):
|
63 |
-
stream = self.__wrapped
|
64 |
-
try:
|
65 |
-
return stream.closed
|
66 |
-
# AttributeError in the case that the stream doesn't support being closed
|
67 |
-
# ValueError for the case that the stream has already been detached when atexit runs
|
68 |
-
except (AttributeError, ValueError):
|
69 |
-
return True
|
70 |
-
|
71 |
-
|
72 |
-
class AnsiToWin32(object):
|
73 |
-
'''
|
74 |
-
Implements a 'write()' method which, on Windows, will strip ANSI character
|
75 |
-
sequences from the text, and if outputting to a tty, will convert them into
|
76 |
-
win32 function calls.
|
77 |
-
'''
|
78 |
-
ANSI_CSI_RE = re.compile('\001?\033\\[((?:\\d|;)*)([a-zA-Z])\002?') # Control Sequence Introducer
|
79 |
-
ANSI_OSC_RE = re.compile('\001?\033\\]([^\a]*)(\a)\002?') # Operating System Command
|
80 |
-
|
81 |
-
def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
|
82 |
-
# The wrapped stream (normally sys.stdout or sys.stderr)
|
83 |
-
self.wrapped = wrapped
|
84 |
-
|
85 |
-
# should we reset colors to defaults after every .write()
|
86 |
-
self.autoreset = autoreset
|
87 |
-
|
88 |
-
# create the proxy wrapping our output stream
|
89 |
-
self.stream = StreamWrapper(wrapped, self)
|
90 |
-
|
91 |
-
on_windows = os.name == 'nt'
|
92 |
-
# We test if the WinAPI works, because even if we are on Windows
|
93 |
-
# we may be using a terminal that doesn't support the WinAPI
|
94 |
-
# (e.g. Cygwin Terminal). In this case it's up to the terminal
|
95 |
-
# to support the ANSI codes.
|
96 |
-
conversion_supported = on_windows and winapi_test()
|
97 |
-
try:
|
98 |
-
fd = wrapped.fileno()
|
99 |
-
except Exception:
|
100 |
-
fd = -1
|
101 |
-
system_has_native_ansi = not on_windows or enable_vt_processing(fd)
|
102 |
-
have_tty = not self.stream.closed and self.stream.isatty()
|
103 |
-
need_conversion = conversion_supported and not system_has_native_ansi
|
104 |
-
|
105 |
-
# should we strip ANSI sequences from our output?
|
106 |
-
if strip is None:
|
107 |
-
strip = need_conversion or not have_tty
|
108 |
-
self.strip = strip
|
109 |
-
|
110 |
-
# should we should convert ANSI sequences into win32 calls?
|
111 |
-
if convert is None:
|
112 |
-
convert = need_conversion and have_tty
|
113 |
-
self.convert = convert
|
114 |
-
|
115 |
-
# dict of ansi codes to win32 functions and parameters
|
116 |
-
self.win32_calls = self.get_win32_calls()
|
117 |
-
|
118 |
-
# are we wrapping stderr?
|
119 |
-
self.on_stderr = self.wrapped is sys.stderr
|
120 |
-
|
121 |
-
def should_wrap(self):
|
122 |
-
'''
|
123 |
-
True if this class is actually needed. If false, then the output
|
124 |
-
stream will not be affected, nor will win32 calls be issued, so
|
125 |
-
wrapping stdout is not actually required. This will generally be
|
126 |
-
False on non-Windows platforms, unless optional functionality like
|
127 |
-
autoreset has been requested using kwargs to init()
|
128 |
-
'''
|
129 |
-
return self.convert or self.strip or self.autoreset
|
130 |
-
|
131 |
-
def get_win32_calls(self):
|
132 |
-
if self.convert and winterm:
|
133 |
-
return {
|
134 |
-
AnsiStyle.RESET_ALL: (winterm.reset_all, ),
|
135 |
-
AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
|
136 |
-
AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
|
137 |
-
AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
|
138 |
-
AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
|
139 |
-
AnsiFore.RED: (winterm.fore, WinColor.RED),
|
140 |
-
AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
|
141 |
-
AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
|
142 |
-
AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
|
143 |
-
AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
|
144 |
-
AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
|
145 |
-
AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
|
146 |
-
AnsiFore.RESET: (winterm.fore, ),
|
147 |
-
AnsiFore.LIGHTBLACK_EX: (winterm.fore, WinColor.BLACK, True),
|
148 |
-
AnsiFore.LIGHTRED_EX: (winterm.fore, WinColor.RED, True),
|
149 |
-
AnsiFore.LIGHTGREEN_EX: (winterm.fore, WinColor.GREEN, True),
|
150 |
-
AnsiFore.LIGHTYELLOW_EX: (winterm.fore, WinColor.YELLOW, True),
|
151 |
-
AnsiFore.LIGHTBLUE_EX: (winterm.fore, WinColor.BLUE, True),
|
152 |
-
AnsiFore.LIGHTMAGENTA_EX: (winterm.fore, WinColor.MAGENTA, True),
|
153 |
-
AnsiFore.LIGHTCYAN_EX: (winterm.fore, WinColor.CYAN, True),
|
154 |
-
AnsiFore.LIGHTWHITE_EX: (winterm.fore, WinColor.GREY, True),
|
155 |
-
AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
|
156 |
-
AnsiBack.RED: (winterm.back, WinColor.RED),
|
157 |
-
AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
|
158 |
-
AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
|
159 |
-
AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
|
160 |
-
AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
|
161 |
-
AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
|
162 |
-
AnsiBack.WHITE: (winterm.back, WinColor.GREY),
|
163 |
-
AnsiBack.RESET: (winterm.back, ),
|
164 |
-
AnsiBack.LIGHTBLACK_EX: (winterm.back, WinColor.BLACK, True),
|
165 |
-
AnsiBack.LIGHTRED_EX: (winterm.back, WinColor.RED, True),
|
166 |
-
AnsiBack.LIGHTGREEN_EX: (winterm.back, WinColor.GREEN, True),
|
167 |
-
AnsiBack.LIGHTYELLOW_EX: (winterm.back, WinColor.YELLOW, True),
|
168 |
-
AnsiBack.LIGHTBLUE_EX: (winterm.back, WinColor.BLUE, True),
|
169 |
-
AnsiBack.LIGHTMAGENTA_EX: (winterm.back, WinColor.MAGENTA, True),
|
170 |
-
AnsiBack.LIGHTCYAN_EX: (winterm.back, WinColor.CYAN, True),
|
171 |
-
AnsiBack.LIGHTWHITE_EX: (winterm.back, WinColor.GREY, True),
|
172 |
-
}
|
173 |
-
return dict()
|
174 |
-
|
175 |
-
def write(self, text):
|
176 |
-
if self.strip or self.convert:
|
177 |
-
self.write_and_convert(text)
|
178 |
-
else:
|
179 |
-
self.wrapped.write(text)
|
180 |
-
self.wrapped.flush()
|
181 |
-
if self.autoreset:
|
182 |
-
self.reset_all()
|
183 |
-
|
184 |
-
|
185 |
-
def reset_all(self):
|
186 |
-
if self.convert:
|
187 |
-
self.call_win32('m', (0,))
|
188 |
-
elif not self.strip and not self.stream.closed:
|
189 |
-
self.wrapped.write(Style.RESET_ALL)
|
190 |
-
|
191 |
-
|
192 |
-
def write_and_convert(self, text):
|
193 |
-
'''
|
194 |
-
Write the given text to our wrapped stream, stripping any ANSI
|
195 |
-
sequences from the text, and optionally converting them into win32
|
196 |
-
calls.
|
197 |
-
'''
|
198 |
-
cursor = 0
|
199 |
-
text = self.convert_osc(text)
|
200 |
-
for match in self.ANSI_CSI_RE.finditer(text):
|
201 |
-
start, end = match.span()
|
202 |
-
self.write_plain_text(text, cursor, start)
|
203 |
-
self.convert_ansi(*match.groups())
|
204 |
-
cursor = end
|
205 |
-
self.write_plain_text(text, cursor, len(text))
|
206 |
-
|
207 |
-
|
208 |
-
def write_plain_text(self, text, start, end):
|
209 |
-
if start < end:
|
210 |
-
self.wrapped.write(text[start:end])
|
211 |
-
self.wrapped.flush()
|
212 |
-
|
213 |
-
|
214 |
-
def convert_ansi(self, paramstring, command):
|
215 |
-
if self.convert:
|
216 |
-
params = self.extract_params(command, paramstring)
|
217 |
-
self.call_win32(command, params)
|
218 |
-
|
219 |
-
|
220 |
-
def extract_params(self, command, paramstring):
|
221 |
-
if command in 'Hf':
|
222 |
-
params = tuple(int(p) if len(p) != 0 else 1 for p in paramstring.split(';'))
|
223 |
-
while len(params) < 2:
|
224 |
-
# defaults:
|
225 |
-
params = params + (1,)
|
226 |
-
else:
|
227 |
-
params = tuple(int(p) for p in paramstring.split(';') if len(p) != 0)
|
228 |
-
if len(params) == 0:
|
229 |
-
# defaults:
|
230 |
-
if command in 'JKm':
|
231 |
-
params = (0,)
|
232 |
-
elif command in 'ABCD':
|
233 |
-
params = (1,)
|
234 |
-
|
235 |
-
return params
|
236 |
-
|
237 |
-
|
238 |
-
def call_win32(self, command, params):
|
239 |
-
if command == 'm':
|
240 |
-
for param in params:
|
241 |
-
if param in self.win32_calls:
|
242 |
-
func_args = self.win32_calls[param]
|
243 |
-
func = func_args[0]
|
244 |
-
args = func_args[1:]
|
245 |
-
kwargs = dict(on_stderr=self.on_stderr)
|
246 |
-
func(*args, **kwargs)
|
247 |
-
elif command in 'J':
|
248 |
-
winterm.erase_screen(params[0], on_stderr=self.on_stderr)
|
249 |
-
elif command in 'K':
|
250 |
-
winterm.erase_line(params[0], on_stderr=self.on_stderr)
|
251 |
-
elif command in 'Hf': # cursor position - absolute
|
252 |
-
winterm.set_cursor_position(params, on_stderr=self.on_stderr)
|
253 |
-
elif command in 'ABCD': # cursor position - relative
|
254 |
-
n = params[0]
|
255 |
-
# A - up, B - down, C - forward, D - back
|
256 |
-
x, y = {'A': (0, -n), 'B': (0, n), 'C': (n, 0), 'D': (-n, 0)}[command]
|
257 |
-
winterm.cursor_adjust(x, y, on_stderr=self.on_stderr)
|
258 |
-
|
259 |
-
|
260 |
-
def convert_osc(self, text):
|
261 |
-
for match in self.ANSI_OSC_RE.finditer(text):
|
262 |
-
start, end = match.span()
|
263 |
-
text = text[:start] + text[end:]
|
264 |
-
paramstring, command = match.groups()
|
265 |
-
if command == BEL:
|
266 |
-
if paramstring.count(";") == 1:
|
267 |
-
params = paramstring.split(";")
|
268 |
-
# 0 - change title and icon (we will only change title)
|
269 |
-
# 1 - change icon (we don't support this)
|
270 |
-
# 2 - change title
|
271 |
-
if params[0] in '02':
|
272 |
-
winterm.set_title(params[1])
|
273 |
-
return text
|
274 |
-
|
275 |
-
|
276 |
-
def flush(self):
|
277 |
-
self.wrapped.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama/initialise.py
DELETED
@@ -1,121 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
2 |
-
import atexit
|
3 |
-
import contextlib
|
4 |
-
import sys
|
5 |
-
|
6 |
-
from .ansitowin32 import AnsiToWin32
|
7 |
-
|
8 |
-
|
9 |
-
def _wipe_internal_state_for_tests():
|
10 |
-
global orig_stdout, orig_stderr
|
11 |
-
orig_stdout = None
|
12 |
-
orig_stderr = None
|
13 |
-
|
14 |
-
global wrapped_stdout, wrapped_stderr
|
15 |
-
wrapped_stdout = None
|
16 |
-
wrapped_stderr = None
|
17 |
-
|
18 |
-
global atexit_done
|
19 |
-
atexit_done = False
|
20 |
-
|
21 |
-
global fixed_windows_console
|
22 |
-
fixed_windows_console = False
|
23 |
-
|
24 |
-
try:
|
25 |
-
# no-op if it wasn't registered
|
26 |
-
atexit.unregister(reset_all)
|
27 |
-
except AttributeError:
|
28 |
-
# python 2: no atexit.unregister. Oh well, we did our best.
|
29 |
-
pass
|
30 |
-
|
31 |
-
|
32 |
-
def reset_all():
|
33 |
-
if AnsiToWin32 is not None: # Issue #74: objects might become None at exit
|
34 |
-
AnsiToWin32(orig_stdout).reset_all()
|
35 |
-
|
36 |
-
|
37 |
-
def init(autoreset=False, convert=None, strip=None, wrap=True):
|
38 |
-
|
39 |
-
if not wrap and any([autoreset, convert, strip]):
|
40 |
-
raise ValueError('wrap=False conflicts with any other arg=True')
|
41 |
-
|
42 |
-
global wrapped_stdout, wrapped_stderr
|
43 |
-
global orig_stdout, orig_stderr
|
44 |
-
|
45 |
-
orig_stdout = sys.stdout
|
46 |
-
orig_stderr = sys.stderr
|
47 |
-
|
48 |
-
if sys.stdout is None:
|
49 |
-
wrapped_stdout = None
|
50 |
-
else:
|
51 |
-
sys.stdout = wrapped_stdout = \
|
52 |
-
wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
|
53 |
-
if sys.stderr is None:
|
54 |
-
wrapped_stderr = None
|
55 |
-
else:
|
56 |
-
sys.stderr = wrapped_stderr = \
|
57 |
-
wrap_stream(orig_stderr, convert, strip, autoreset, wrap)
|
58 |
-
|
59 |
-
global atexit_done
|
60 |
-
if not atexit_done:
|
61 |
-
atexit.register(reset_all)
|
62 |
-
atexit_done = True
|
63 |
-
|
64 |
-
|
65 |
-
def deinit():
|
66 |
-
if orig_stdout is not None:
|
67 |
-
sys.stdout = orig_stdout
|
68 |
-
if orig_stderr is not None:
|
69 |
-
sys.stderr = orig_stderr
|
70 |
-
|
71 |
-
|
72 |
-
def just_fix_windows_console():
|
73 |
-
global fixed_windows_console
|
74 |
-
|
75 |
-
if sys.platform != "win32":
|
76 |
-
return
|
77 |
-
if fixed_windows_console:
|
78 |
-
return
|
79 |
-
if wrapped_stdout is not None or wrapped_stderr is not None:
|
80 |
-
# Someone already ran init() and it did stuff, so we won't second-guess them
|
81 |
-
return
|
82 |
-
|
83 |
-
# On newer versions of Windows, AnsiToWin32.__init__ will implicitly enable the
|
84 |
-
# native ANSI support in the console as a side-effect. We only need to actually
|
85 |
-
# replace sys.stdout/stderr if we're in the old-style conversion mode.
|
86 |
-
new_stdout = AnsiToWin32(sys.stdout, convert=None, strip=None, autoreset=False)
|
87 |
-
if new_stdout.convert:
|
88 |
-
sys.stdout = new_stdout
|
89 |
-
new_stderr = AnsiToWin32(sys.stderr, convert=None, strip=None, autoreset=False)
|
90 |
-
if new_stderr.convert:
|
91 |
-
sys.stderr = new_stderr
|
92 |
-
|
93 |
-
fixed_windows_console = True
|
94 |
-
|
95 |
-
@contextlib.contextmanager
|
96 |
-
def colorama_text(*args, **kwargs):
|
97 |
-
init(*args, **kwargs)
|
98 |
-
try:
|
99 |
-
yield
|
100 |
-
finally:
|
101 |
-
deinit()
|
102 |
-
|
103 |
-
|
104 |
-
def reinit():
|
105 |
-
if wrapped_stdout is not None:
|
106 |
-
sys.stdout = wrapped_stdout
|
107 |
-
if wrapped_stderr is not None:
|
108 |
-
sys.stderr = wrapped_stderr
|
109 |
-
|
110 |
-
|
111 |
-
def wrap_stream(stream, convert, strip, autoreset, wrap):
|
112 |
-
if wrap:
|
113 |
-
wrapper = AnsiToWin32(stream,
|
114 |
-
convert=convert, strip=strip, autoreset=autoreset)
|
115 |
-
if wrapper.should_wrap():
|
116 |
-
stream = wrapper.stream
|
117 |
-
return stream
|
118 |
-
|
119 |
-
|
120 |
-
# Use this for initial setup as well, to reduce code duplication
|
121 |
-
_wipe_internal_state_for_tests()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama/tests/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
|
|
|
env/Lib/site-packages/colorama/tests/ansi_test.py
DELETED
@@ -1,76 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
2 |
-
import sys
|
3 |
-
from unittest import TestCase, main
|
4 |
-
|
5 |
-
from ..ansi import Back, Fore, Style
|
6 |
-
from ..ansitowin32 import AnsiToWin32
|
7 |
-
|
8 |
-
stdout_orig = sys.stdout
|
9 |
-
stderr_orig = sys.stderr
|
10 |
-
|
11 |
-
|
12 |
-
class AnsiTest(TestCase):
|
13 |
-
|
14 |
-
def setUp(self):
|
15 |
-
# sanity check: stdout should be a file or StringIO object.
|
16 |
-
# It will only be AnsiToWin32 if init() has previously wrapped it
|
17 |
-
self.assertNotEqual(type(sys.stdout), AnsiToWin32)
|
18 |
-
self.assertNotEqual(type(sys.stderr), AnsiToWin32)
|
19 |
-
|
20 |
-
def tearDown(self):
|
21 |
-
sys.stdout = stdout_orig
|
22 |
-
sys.stderr = stderr_orig
|
23 |
-
|
24 |
-
|
25 |
-
def testForeAttributes(self):
|
26 |
-
self.assertEqual(Fore.BLACK, '\033[30m')
|
27 |
-
self.assertEqual(Fore.RED, '\033[31m')
|
28 |
-
self.assertEqual(Fore.GREEN, '\033[32m')
|
29 |
-
self.assertEqual(Fore.YELLOW, '\033[33m')
|
30 |
-
self.assertEqual(Fore.BLUE, '\033[34m')
|
31 |
-
self.assertEqual(Fore.MAGENTA, '\033[35m')
|
32 |
-
self.assertEqual(Fore.CYAN, '\033[36m')
|
33 |
-
self.assertEqual(Fore.WHITE, '\033[37m')
|
34 |
-
self.assertEqual(Fore.RESET, '\033[39m')
|
35 |
-
|
36 |
-
# Check the light, extended versions.
|
37 |
-
self.assertEqual(Fore.LIGHTBLACK_EX, '\033[90m')
|
38 |
-
self.assertEqual(Fore.LIGHTRED_EX, '\033[91m')
|
39 |
-
self.assertEqual(Fore.LIGHTGREEN_EX, '\033[92m')
|
40 |
-
self.assertEqual(Fore.LIGHTYELLOW_EX, '\033[93m')
|
41 |
-
self.assertEqual(Fore.LIGHTBLUE_EX, '\033[94m')
|
42 |
-
self.assertEqual(Fore.LIGHTMAGENTA_EX, '\033[95m')
|
43 |
-
self.assertEqual(Fore.LIGHTCYAN_EX, '\033[96m')
|
44 |
-
self.assertEqual(Fore.LIGHTWHITE_EX, '\033[97m')
|
45 |
-
|
46 |
-
|
47 |
-
def testBackAttributes(self):
|
48 |
-
self.assertEqual(Back.BLACK, '\033[40m')
|
49 |
-
self.assertEqual(Back.RED, '\033[41m')
|
50 |
-
self.assertEqual(Back.GREEN, '\033[42m')
|
51 |
-
self.assertEqual(Back.YELLOW, '\033[43m')
|
52 |
-
self.assertEqual(Back.BLUE, '\033[44m')
|
53 |
-
self.assertEqual(Back.MAGENTA, '\033[45m')
|
54 |
-
self.assertEqual(Back.CYAN, '\033[46m')
|
55 |
-
self.assertEqual(Back.WHITE, '\033[47m')
|
56 |
-
self.assertEqual(Back.RESET, '\033[49m')
|
57 |
-
|
58 |
-
# Check the light, extended versions.
|
59 |
-
self.assertEqual(Back.LIGHTBLACK_EX, '\033[100m')
|
60 |
-
self.assertEqual(Back.LIGHTRED_EX, '\033[101m')
|
61 |
-
self.assertEqual(Back.LIGHTGREEN_EX, '\033[102m')
|
62 |
-
self.assertEqual(Back.LIGHTYELLOW_EX, '\033[103m')
|
63 |
-
self.assertEqual(Back.LIGHTBLUE_EX, '\033[104m')
|
64 |
-
self.assertEqual(Back.LIGHTMAGENTA_EX, '\033[105m')
|
65 |
-
self.assertEqual(Back.LIGHTCYAN_EX, '\033[106m')
|
66 |
-
self.assertEqual(Back.LIGHTWHITE_EX, '\033[107m')
|
67 |
-
|
68 |
-
|
69 |
-
def testStyleAttributes(self):
|
70 |
-
self.assertEqual(Style.DIM, '\033[2m')
|
71 |
-
self.assertEqual(Style.NORMAL, '\033[22m')
|
72 |
-
self.assertEqual(Style.BRIGHT, '\033[1m')
|
73 |
-
|
74 |
-
|
75 |
-
if __name__ == '__main__':
|
76 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
env/Lib/site-packages/colorama/tests/ansitowin32_test.py
DELETED
@@ -1,294 +0,0 @@
|
|
1 |
-
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
2 |
-
from io import StringIO, TextIOWrapper
|
3 |
-
from unittest import TestCase, main
|
4 |
-
try:
|
5 |
-
from contextlib import ExitStack
|
6 |
-
except ImportError:
|
7 |
-
# python 2
|
8 |
-
from contextlib2 import ExitStack
|
9 |
-
|
10 |
-
try:
|
11 |
-
from unittest.mock import MagicMock, Mock, patch
|
12 |
-
except ImportError:
|
13 |
-
from mock import MagicMock, Mock, patch
|
14 |
-
|
15 |
-
from ..ansitowin32 import AnsiToWin32, StreamWrapper
|
16 |
-
from ..win32 import ENABLE_VIRTUAL_TERMINAL_PROCESSING
|
17 |
-
from .utils import osname
|
18 |
-
|
19 |
-
|
20 |
-
class StreamWrapperTest(TestCase):
|
21 |
-
|
22 |
-
def testIsAProxy(self):
|
23 |
-
mockStream = Mock()
|
24 |
-
wrapper = StreamWrapper(mockStream, None)
|
25 |
-
self.assertTrue( wrapper.random_attr is mockStream.random_attr )
|
26 |
-
|
27 |
-
def testDelegatesWrite(self):
|
28 |
-
mockStream = Mock()
|
29 |
-
mockConverter = Mock()
|
30 |
-
wrapper = StreamWrapper(mockStream, mockConverter)
|
31 |
-
wrapper.write('hello')
|
32 |
-
self.assertTrue(mockConverter.write.call_args, (('hello',), {}))
|
33 |
-
|
34 |
-
def testDelegatesContext(self):
|
35 |
-
mockConverter = Mock()
|
36 |
-
s = StringIO()
|
37 |
-
with StreamWrapper(s, mockConverter) as fp:
|
38 |
-
fp.write(u'hello')
|
39 |
-
self.assertTrue(s.closed)
|
40 |
-
|
41 |
-
def testProxyNoContextManager(self):
|
42 |
-
mockStream = MagicMock()
|
43 |
-
mockStream.__enter__.side_effect = AttributeError()
|
44 |
-
mockConverter = Mock()
|
45 |
-
with self.assertRaises(AttributeError) as excinfo:
|
46 |
-
with StreamWrapper(mockStream, mockConverter) as wrapper:
|
47 |
-
wrapper.write('hello')
|
48 |
-
|
49 |
-
def test_closed_shouldnt_raise_on_closed_stream(self):
|
50 |
-
stream = StringIO()
|
51 |
-
stream.close()
|
52 |
-
wrapper = StreamWrapper(stream, None)
|
53 |
-
self.assertEqual(wrapper.closed, True)
|
54 |
-
|
55 |
-
def test_closed_shouldnt_raise_on_detached_stream(self):
|
56 |
-
stream = TextIOWrapper(StringIO())
|
57 |
-
stream.detach()
|
58 |
-
wrapper = StreamWrapper(stream, None)
|
59 |
-
self.assertEqual(wrapper.closed, True)
|
60 |
-
|
61 |
-
class AnsiToWin32Test(TestCase):
|
62 |
-
|
63 |
-
def testInit(self):
|
64 |
-
mockStdout = Mock()
|
65 |
-
auto = Mock()
|
66 |
-
stream = AnsiToWin32(mockStdout, autoreset=auto)
|
67 |
-
self.assertEqual(stream.wrapped, mockStdout)
|
68 |
-
self.assertEqual(stream.autoreset, auto)
|
69 |
-
|
70 |
-
@patch('colorama.ansitowin32.winterm', None)
|
71 |
-
@patch('colorama.ansitowin32.winapi_test', lambda *_: True)
|
72 |
-
def testStripIsTrueOnWindows(self):
|
73 |
-
with osname('nt'):
|
74 |
-
mockStdout = Mock()
|
75 |
-
stream = AnsiToWin32(mockStdout)
|
76 |
-
self.assertTrue(stream.strip)
|
77 |
-
|
78 |
-
def testStripIsFalseOffWindows(self):
|
79 |
-
with osname('posix'):
|
80 |
-
mockStdout = Mock(closed=False)
|
81 |
-
stream = AnsiToWin32(mockStdout)
|
82 |
-
self.assertFalse(stream.strip)
|
83 |
-
|
84 |
-
def testWriteStripsAnsi(self):
|
85 |
-
mockStdout = Mock()
|
86 |
-
stream = AnsiToWin32(mockStdout)
|
87 |
-
stream.wrapped = Mock()
|
88 |
-
stream.write_and_convert = Mock()
|
89 |
-
stream.strip = True
|
90 |
-
|
91 |
-
stream.write('abc')
|
92 |
-
|
93 |
-
self.assertFalse(stream.wrapped.write.called)
|
94 |
-
self.assertEqual(stream.write_and_convert.call_args, (('abc',), {}))
|
95 |
-
|
96 |
-
def testWriteDoesNotStripAnsi(self):
|
97 |
-
mockStdout = Mock()
|
98 |
-
stream = AnsiToWin32(mockStdout)
|
99 |
-
stream.wrapped = Mock()
|
100 |
-
stream.write_and_convert = Mock()
|
101 |
-
stream.strip = False
|
102 |
-
stream.convert = False
|
103 |
-
|
104 |
-
stream.write('abc')
|
105 |
-
|
106 |
-
self.assertFalse(stream.write_and_convert.called)
|
107 |
-
self.assertEqual(stream.wrapped.write.call_args, (('abc',), {}))
|
108 |
-
|
109 |
-
def assert_autoresets(self, convert, autoreset=True):
|
110 |
-
stream = AnsiToWin32(Mock())
|
111 |
-
stream.convert = convert
|
112 |
-
stream.reset_all = Mock()
|
113 |
-
stream.autoreset = autoreset
|
114 |
-
stream.winterm = Mock()
|
115 |
-
|
116 |
-
stream.write('abc')
|
117 |
-
|
118 |
-
self.assertEqual(stream.reset_all.called, autoreset)
|
119 |
-
|
120 |
-
def testWriteAutoresets(self):
|
121 |
-
self.assert_autoresets(convert=True)
|
122 |
-
self.assert_autoresets(convert=False)
|
123 |
-
self.assert_autoresets(convert=True, autoreset=False)
|
124 |
-
self.assert_autoresets(convert=False, autoreset=False)
|
125 |
-
|
126 |
-
def testWriteAndConvertWritesPlainText(self):
|
127 |
-
stream = AnsiToWin32(Mock())
|
128 |
-
stream.write_and_convert( 'abc' )
|
129 |
-
self.assertEqual( stream.wrapped.write.call_args, (('abc',), {}) )
|
130 |
-
|
131 |
-
def testWriteAndConvertStripsAllValidAnsi(self):
|
132 |
-
stream = AnsiToWin32(Mock())
|
133 |
-
stream.call_win32 = Mock()
|
134 |
-
data = [
|
135 |
-
'abc\033[mdef',
|
136 |
-
'abc\033[0mdef',
|
137 |
-
'abc\033[2mdef',
|
138 |
-
'abc\033[02mdef',
|
139 |
-
'abc\033[002mdef',
|
140 |
-
'abc\033[40mdef',
|
141 |
-
'abc\033[040mdef',
|
142 |
-
'abc\033[0;1mdef',
|
143 |
-
'abc\033[40;50mdef',
|
144 |
-
'abc\033[50;30;40mdef',
|
145 |
-
'abc\033[Adef',
|
146 |
-
'abc\033[0Gdef',
|
147 |
-
'abc\033[1;20;128Hdef',
|
148 |
-
]
|
149 |
-
for datum in data:
|
150 |
-
stream.wrapped.write.reset_mock()
|
151 |
-
stream.write_and_convert( datum )
|
152 |
-
self.assertEqual(
|
153 |
-
[args[0] for args in stream.wrapped.write.call_args_list],
|
154 |
-
[ ('abc',), ('def',) ]
|
155 |
-
)
|
156 |
-
|
157 |
-
def testWriteAndConvertSkipsEmptySnippets(self):
|
158 |
-
stream = AnsiToWin32(Mock())
|
159 |
-
stream.call_win32 = Mock()
|
160 |
-
stream.write_and_convert( '\033[40m\033[41m' )
|
161 |
-
self.assertFalse( stream.wrapped.write.called )
|
162 |
-
|
163 |
-
def testWriteAndConvertCallsWin32WithParamsAndCommand(self):
|
164 |
-
stream = AnsiToWin32(Mock())
|
165 |
-
stream.convert = True
|
166 |
-
stream.call_win32 = Mock()
|
167 |
-
stream.extract_params = Mock(return_value='params')
|
168 |
-
data = {
|
169 |
-
'abc\033[adef': ('a', 'params'),
|
170 |
-
'abc\033[;;bdef': ('b', 'params'),
|
171 |
-
'abc\033[0cdef': ('c', 'params'),
|
172 |
-
'abc\033[;;0;;Gdef': ('G', 'params'),
|
173 |
-
'abc\033[1;20;128Hdef': ('H', 'params'),
|
174 |
-
}
|
175 |
-
for datum, expected in data.items():
|
176 |
-
stream.call_win32.reset_mock()
|
177 |
-
stream.write_and_convert( datum )
|
178 |
-
self.assertEqual( stream.call_win32.call_args[0], expected )
|
179 |
-
|
180 |
-
def test_reset_all_shouldnt_raise_on_closed_orig_stdout(self):
|
181 |
-
stream = StringIO()
|
182 |
-
converter = AnsiToWin32(stream)
|
183 |
-
stream.close()
|
184 |
-
|
185 |
-
converter.reset_all()
|
186 |
-
|
187 |
-
def test_wrap_shouldnt_raise_on_closed_orig_stdout(self):
|
188 |
-
stream = StringIO()
|
189 |
-
stream.close()
|
190 |
-
with \
|
191 |
-
patch("colorama.ansitowin32.os.name", "nt"), \
|
192 |
-
patch("colorama.ansitowin32.winapi_test", lambda: True):
|
193 |
-
converter = AnsiToWin32(stream)
|
194 |
-
self.assertTrue(converter.strip)
|
195 |
-
self.assertFalse(converter.convert)
|
196 |
-
|
197 |
-
def test_wrap_shouldnt_raise_on_missing_closed_attr(self):
|
198 |
-
with \
|
199 |
-
patch("colorama.ansitowin32.os.name", "nt"), \
|
200 |
-
patch("colorama.ansitowin32.winapi_test", lambda: True):
|
201 |
-
converter = AnsiToWin32(object())
|
202 |
-
self.assertTrue(converter.strip)
|
203 |
-
self.assertFalse(converter.convert)
|
204 |
-
|
205 |
-
def testExtractParams(self):
|
206 |
-
stream = AnsiToWin32(Mock())
|
207 |
-
data = {
|
208 |
-
'': (0,),
|
209 |
-
';;': (0,),
|
210 |
-
'2': (2,),
|
211 |
-
';;002;;': (2,),
|
212 |
-
'0;1': (0, 1),
|
213 |
-
';;003;;456;;': (3, 456),
|
214 |
-
'11;22;33;44;55': (11, 22, 33, 44, 55),
|
215 |
-
}
|
216 |
-
for datum, expected in data.items():
|
217 |
-
self.assertEqual(stream.extract_params('m', datum), expected)
|
218 |
-
|
219 |
-
def testCallWin32UsesLookup(self):
|
220 |
-
listener = Mock()
|
221 |
-
stream = AnsiToWin32(listener)
|
222 |
-
stream.win32_calls = {
|
223 |
-
1: (lambda *_, **__: listener(11),),
|
224 |
-
2: (lambda *_, **__: listener(22),),
|
225 |
-
3: (lambda *_, **__: listener(33),),
|
226 |
-
}
|
227 |
-
stream.call_win32('m', (3, 1, 99, 2))
|
228 |
-
self.assertEqual(
|
229 |
-
[a[0][0] for a in listener.call_args_list],
|
230 |
-
[33, 11, 22] )
|
231 |
-
|
232 |
-
def test_osc_codes(self):
|
233 |
-
mockStdout = Mock()
|
234 |
-
stream = AnsiToWin32(mockStdout, convert=True)
|
235 |
-
with patch('colorama.ansitowin32.winterm') as winterm:
|
236 |
-
data = [
|
237 |
-
'\033]0\x07', # missing arguments
|
238 |
-
'\033]0;foo\x08', # wrong OSC command
|
239 |
-
'\033]0;colorama_test_title\x07', # should work
|
240 |
-
'\033]1;colorama_test_title\x07', # wrong set command
|
241 |
-
'\033]2;colorama_test_title\x07', # should work
|
242 |
-
'\033]' + ';' * 64 + '\x08', # see issue #247
|
243 |
-
]
|
244 |
-
for code in data:
|
245 |
-
stream.write(code)
|
246 |
-
self.assertEqual(winterm.set_title.call_count, 2)
|
247 |
-
|
248 |
-
def test_native_windows_ansi(self):
|
249 |
-
with ExitStack() as stack:
|
250 |
-
def p(a, b):
|
251 |
-
stack.enter_context(patch(a, b, create=True))
|
252 |
-
# Pretend to be on Windows
|
253 |
-
p("colorama.ansitowin32.os.name", "nt")
|
254 |
-
p("colorama.ansitowin32.winapi_test", lambda: True)
|
255 |
-
p("colorama.win32.winapi_test", lambda: True)
|
256 |
-
p("colorama.winterm.win32.windll", "non-None")
|
257 |
-
p("colorama.winterm.get_osfhandle", lambda _: 1234)
|
258 |
-
|
259 |
-
# Pretend that our mock stream has native ANSI support
|
260 |
-
p(
|
261 |
-
"colorama.winterm.win32.GetConsoleMode",
|
262 |
-
lambda _: ENABLE_VIRTUAL_TERMINAL_PROCESSING,
|
263 |
-
)
|
264 |
-
SetConsoleMode = Mock()
|
265 |
-
p("colorama.winterm.win32.SetConsoleMode", SetConsoleMode)
|
266 |
-
|
267 |
-
stdout = Mock()
|
268 |
-
stdout.closed = False
|
269 |
-
stdout.isatty.return_value = True
|
270 |
-
stdout.fileno.return_value = 1
|
271 |
-
|
272 |
-
# Our fake console says it has native vt support, so AnsiToWin32 should
|
273 |
-
# enable that support and do nothing else.
|
274 |
-
stream = AnsiToWin32(stdout)
|
275 |
-
SetConsoleMode.assert_called_with(1234, ENABLE_VIRTUAL_TERMINAL_PROCESSING)
|
276 |
-
self.assertFalse(stream.strip)
|
277 |
-
self.assertFalse(stream.convert)
|
278 |
-
self.assertFalse(stream.should_wrap())
|
279 |
-
|
280 |
-
# Now let's pretend we're on an old Windows console, that doesn't have
|
281 |
-
# native ANSI support.
|
282 |
-
p("colorama.winterm.win32.GetConsoleMode", lambda _: 0)
|
283 |
-
SetConsoleMode = Mock()
|
284 |
-
p("colorama.winterm.win32.SetConsoleMode", SetConsoleMode)
|
285 |
-
|
286 |
-
stream = AnsiToWin32(stdout)
|
287 |
-
SetConsoleMode.assert_called_with(1234, ENABLE_VIRTUAL_TERMINAL_PROCESSING)
|
288 |
-
self.assertTrue(stream.strip)
|
289 |
-
self.assertTrue(stream.convert)
|
290 |
-
self.assertTrue(stream.should_wrap())
|
291 |
-
|
292 |
-
|
293 |
-
if __name__ == '__main__':
|
294 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|