Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .dockerignore +4 -0
- .gitattributes +20 -35
- .gitignore +46 -0
- Dockerfile +17 -0
- app.py +112 -0
- brain_ai.py +113 -0
- code.ipynb +1758 -0
- demo.html +66 -0
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +1 -0
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE +20 -0
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA +46 -0
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD +43 -0
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL +5 -0
- env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +2 -0
- env/Lib/site-packages/_yaml/__init__.py +33 -0
- env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER +1 -0
- env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE +20 -0
- env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA +77 -0
- env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD +14 -0
- env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL +5 -0
- env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt +1 -0
- env/Lib/site-packages/certifi/__init__.py +4 -0
- env/Lib/site-packages/certifi/__main__.py +12 -0
- env/Lib/site-packages/certifi/cacert.pem +0 -0
- env/Lib/site-packages/certifi/core.py +114 -0
- env/Lib/site-packages/certifi/py.typed +0 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +1 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +21 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +721 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +35 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +5 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +2 -0
- env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +1 -0
- env/Lib/site-packages/charset_normalizer/__init__.py +48 -0
- env/Lib/site-packages/charset_normalizer/__main__.py +6 -0
- env/Lib/site-packages/charset_normalizer/api.py +668 -0
- env/Lib/site-packages/charset_normalizer/cd.py +395 -0
- env/Lib/site-packages/charset_normalizer/cli/__init__.py +8 -0
- env/Lib/site-packages/charset_normalizer/cli/__main__.py +321 -0
- env/Lib/site-packages/charset_normalizer/constant.py +1998 -0
- env/Lib/site-packages/charset_normalizer/legacy.py +66 -0
- env/Lib/site-packages/charset_normalizer/md.py +630 -0
- env/Lib/site-packages/charset_normalizer/models.py +360 -0
- env/Lib/site-packages/charset_normalizer/py.typed +0 -0
- env/Lib/site-packages/charset_normalizer/utils.py +408 -0
- env/Lib/site-packages/charset_normalizer/version.py +8 -0
- env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER +1 -0
- env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA +441 -0
- env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD +31 -0
- env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL +5 -0
.dockerignore
ADDED
@@ -0,0 +1,4 @@
__pycache__
.git
.vscode
venv
.gitattributes
CHANGED
@@ -1,35 +1,20 @@
- (old lines 1-20: removed; their content is not shown in this view)
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Auto detect text files and perform LF normalization
+ * text=auto
+ env/Lib/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
+ env/Lib/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
+ env/Lib/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
+ env/Lib/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/dotenv.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/huggingface-cli.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/normalizer.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pip.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pip3.12.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pip3.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/tqdm.exe filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_1_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_2_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_3_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_4_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_5_batch_0.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,46 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Virtual environment
venv/
.env
*.env

# Logs & Debugging
*.log
logs/
debug.log

# System Files
.DS_Store
Thumbs.db

# Python dependencies
Pipfile
Pipfile.lock
requirements.txt

# IDE & Editor Specific
.vscode/
.idea/
*.iml

# Compiled Python packages
*.egg
*.egg-info/
dist/
build/

# Hugging Face Cache
.huggingface/

# Node.js & Frontend files
node_modules/
npm-debug.log
yarn-error.log

# Flask Specific
instance/
config.py
Dockerfile
ADDED
@@ -0,0 +1,17 @@
# Use an official Python runtime as a parent image
FROM python:3.9

# Set the working directory in the container
WORKDIR /BrainAI

# Copy the current directory contents into the container
COPY . /BrainAI

# Install any needed dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose the application port (default Flask port)
EXPOSE 5000

# Run the application
CMD ["python", "app.py"]
app.py
ADDED
@@ -0,0 +1,112 @@
import os
import cv2
import numpy as np
from flask import Flask, render_template, request, redirect, url_for, jsonify
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import tensorflow as tf

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'static/uploads'
app.config['ALLOWED_EXTENSIONS'] = {'png', 'jpg', 'jpeg'}

class Sampling(tf.keras.layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

# Load models with explicit TensorFlow context
with tf.init_scope():
    ct_to_mri_model = load_model('models/ct_to_mri_epoch_39.h5',
                                 custom_objects={'Sampling': Sampling})
    mri_to_ct_model = load_model('models/mri_to_ct_epoch_39.h5',
                                 custom_objects={'Sampling': Sampling})

def allowed_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']

def process_image(image_path, model):
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError("Could not load image")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (256, 256))
    img = img.astype(np.float32) / 255.0
    img = np.expand_dims(img, axis=0)

    prediction = model.predict(img)
    if isinstance(prediction, (list, tuple)):
        prediction = prediction[0]

    prediction = np.squeeze(prediction)
    prediction = (prediction * 255).astype(np.uint8)
    return prediction

def clean_uploads():
    upload_dir = app.config['UPLOAD_FOLDER']
    for filename in os.listdir(upload_dir):
        file_path = os.path.join(upload_dir, filename)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(f'Error deleting {file_path}: {e}')

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/try_now')
def try_now():
    return render_template('try_now.html')

@app.route('/samples')
def samples():
    return render_template('samples.html')

@app.route('/model_info')
def model_info():
    return render_template('model_info.html')

@app.route('/translate', methods=['POST'])
def translate():
    clean_uploads()  # Clean previous uploads

    if 'file' not in request.files:
        return redirect(request.url)

    file = request.files['file']
    if file.filename == '':
        return redirect(request.url)

    if file and allowed_file(file.filename):
        # Save original image
        upload_path = os.path.join(app.config['UPLOAD_FOLDER'], 'original.png')
        file.save(upload_path)

        # Choose model based on translation direction
        direction = request.form.get('direction')
        model = ct_to_mri_model if direction == 'ct_to_mri' else mri_to_ct_model

        try:
            result = process_image(upload_path, model)
            result_path = os.path.join(app.config['UPLOAD_FOLDER'], 'result.png')
            plt.imsave(result_path, result)

            return render_template('result.html',
                                   original=url_for('static', filename='uploads/original.png'),
                                   result=url_for('static', filename='uploads/result.png'))
        except Exception as e:
            return f"Error processing image: {str(e)}"

    return redirect(url_for('try_now'))

if __name__ == '__main__':
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    app.run(host='0.0.0.0', port=5000, debug=False)  # Set debug=False for production
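As a usage illustration (an added sketch, not part of the uploaded files), a minimal client for the /translate endpoint defined above, assuming the Flask app is running locally on port 5000 and that ct_scan.png is a hypothetical input image:

import requests

# 'file' and 'direction' match the form fields read by the /translate route;
# direction is either 'ct_to_mri' or 'mri_to_ct'.
with open("ct_scan.png", "rb") as f:
    resp = requests.post(
        "http://localhost:5000/translate",
        files={"file": ("ct_scan.png", f, "image/png")},
        data={"direction": "ct_to_mri"},
    )

print(resp.status_code)   # the route responds with the rendered result.html page
print(resp.text[:200])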
brain_ai.py
ADDED
@@ -0,0 +1,113 @@
import streamlit as st
import time
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    AIMessagePromptTemplate,
    ChatPromptTemplate
)

st.title("🧠 BrainAI")
st.caption("🚀 Your own AI Neurologist with SuperPowers!!")

# Common user query suggestions
suggestions = [
    "What are the early symptoms of a brain tumor?",
    "How is a brain tumor diagnosed?",
    "What are the treatment options for brain tumors?",
    "Can a brain tumor be non-cancerous?",
    "What lifestyle changes can help manage brain tumors?"
]

# Display suggestions in rows and keep them fixed at the top
# st.write("### 💡 Common Questions")
suggestion_container = st.container()
with suggestion_container:
    for query in suggestions:
        if st.button(query, key=query):
            st.session_state["user_input"] = query
            st.rerun()

# Initiate chat engine
llm_engine = ChatOllama(
    model="deepseek-r1:1.5b",
    base_url="http://localhost:11434",
    temperature=0.3
)

# System prompt
system_prompt = SystemMessagePromptTemplate.from_template("""
You are BrainAI, an AI-powered neurologist assistant designed to provide non-emergency guidance, education,
and support for neurological health. Your expertise includes brain anatomy, neurological disorders (e.g.,
epilepsy, Alzheimer’s, brain tumors, migraines), symptoms, diagnostics, and general brain health tips.
Always prioritize ethical guidelines, clarify your limitations, and emphasize consulting a licensed professional
for personal care. Answer only in English language.
""")

# Session management
if "message_log" not in st.session_state:
    st.session_state.message_log = [{"role": "assistant", "content": "Hello! How can I assist you with brain health today?"}]

# Chat container
chat_container = st.container()

# Display messages with animation
def display_text_with_animation(text):
    message_placeholder = st.empty()
    displayed_text = ""
    for char in text:
        displayed_text += char
        message_placeholder.markdown(displayed_text)
        time.sleep(0.01)

with chat_container:
    for message in st.session_state.message_log:
        with st.chat_message(message["role"]):
            if "<think>" in message["content"]:
                parts = message["content"].split("</think>")
                think_content = parts[0].replace("<think>", "").strip()
                actual_response = parts[-1].strip()

                with st.expander("🔍 View AI's Thinking Process"):
                    st.markdown(f"*Internal Analysis:*\n{think_content}")

                display_text_with_animation(actual_response)
            else:
                display_text_with_animation(message["content"])

# Chat input
user_query = st.chat_input(" Ask anything about brain health ...")

# If a suggestion was selected, use it as the input
if "user_input" in st.session_state:
    user_query = st.session_state["user_input"]
    del st.session_state["user_input"]

def generate_ai_response(prompt_chain):
    processing_pipeline = prompt_chain | llm_engine | StrOutputParser()
    return processing_pipeline.invoke({})

def build_prompt_chain():
    prompt_sequence = [system_prompt]
    for msg in st.session_state.message_log:
        if msg["role"] == "user":
            prompt_sequence.append(HumanMessagePromptTemplate.from_template(msg["content"]))
        elif msg["role"] == "assistant":
            prompt_sequence.append(AIMessagePromptTemplate.from_template(msg["content"]))
    return ChatPromptTemplate.from_messages(prompt_sequence)

if user_query:
    st.session_state.message_log.append({"role": "user", "content": user_query})

    with st.spinner("🧠 Thinking ..."):
        prompt_chain = build_prompt_chain()
        raw_response = generate_ai_response(prompt_chain)

    st.session_state.message_log.append({
        "role": "assistant",
        "content": raw_response
    })

    st.rerun()
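For context (an added sketch, not part of the repository), the same LangChain pieces used in brain_ai.py can be exercised outside Streamlit, assuming an Ollama server is running locally with the deepseek-r1:1.5b model pulled:

from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

# Same engine settings as brain_ai.py.
llm_engine = ChatOllama(model="deepseek-r1:1.5b", base_url="http://localhost:11434", temperature=0.3)

prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template("You are BrainAI, an AI-powered neurologist assistant."),
    HumanMessagePromptTemplate.from_template("What are the early symptoms of a brain tumor?"),
])

# Same pipeline shape as generate_ai_response(): prompt | model | string parser.
chain = prompt | llm_engine | StrOutputParser()
print(chain.invoke({}))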
code.ipynb
ADDED
@@ -0,0 +1,1758 @@
Notebook contents, cell by cell:

[markdown cell]
# Image to Image Translation

[code cell 1]
!pip install tensorflow==2.15.0
(output: "Requirement already satisfied" messages for tensorflow==2.15.0, tensorflow-intel==2.15.0 and their dependencies — grpcio, tensorflow-io-gcs-filesystem, numpy, packaging, tensorflow-estimator, libclang, six, absl-py, setuptools, h5py, gast, flatbuffers, google-pasta, termcolor, astunparse, typing-extensions, tensorboard, keras, protobuf, ml-dtypes, wrapt, opt-einsum, wheel, requests, tensorboard-data-server, werkzeug, markdown, google-auth, google-auth-oauthlib, cachetools, pyasn1-modules, rsa, requests-oauthlib, charset-normalizer, idna, certifi, urllib3, MarkupSafe, pyasn1, oauthlib — all already present in the local virtual environment; pip also notes that release 25.0.1 is available, current 23.0.1)

[code cell 2]
!pip install tensorflow-probability==0.23.0
(output: "Requirement already satisfied" messages for tensorflow-probability==0.23.0 and its dependencies — six, decorator, cloudpickle, gast, numpy, absl-py, dm-tree, attrs, wrapt — plus the same pip upgrade notice)

[markdown cell]
1️⃣ Import Necessary Libraries

1. TensorFlow/Keras for building and training deep learning models.
2. NumPy for numerical operations.
3. Matplotlib for visualizing the results.
4. OpenCV/PIL for image processing.
5. TensorFlow Addons for additional loss functions and layers (e.g., InstanceNorm).
6. TensorFlow Datasets (or custom loaders) to load CT & MRI images.

[code cell 3]
import tensorflow as tf
from tensorflow.keras import layers, Model
import numpy as np
import cv2
import pathlib
import matplotlib.pyplot as plt
import tensorflow_probability as tfp

tfd = tfp.distributions
(output: TensorFlow deprecation warnings for tf.losses.sparse_softmax_cross_entropy, tf.logging.TaskLevelStatusMessage, and tf.control_flow_v2_enabled, emitted while importing Keras and TensorFlow Probability)

[markdown cell]
2️⃣ Configuration (Hyperparameters)

This step defines the key settings for training.

Image size: the input image dimensions.
Latent dimension: the size of the encoded representation in the VAE.
Learning rate: how fast the model updates its weights.
Batch size & epochs: training parameters.

[code cell 4]
IMAGE_SHAPE = (256, 256, 3)
LATENT_DIM = 256
FILTERS = 16
KERNEL = 3
LEARNING_RATE = 0.0001
WEIGHT_DECAY = 6e-8
BATCH_SIZE = 1
EPOCHS = 10

[markdown cell]
===================== Architecture Components =====================

[markdown cell]
3️⃣ Sampling Layer for the Variational Autoencoder (VAE)

The sampling layer is a crucial part of the VAE: it is where we sample from the latent space.

🔹 What we need:
The encoder outputs μ (the mean) and log σ² (the log variance).
This layer samples from a normal distribution using the reparameterization trick.

[code cell]
class Sampling(layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.random.normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

[markdown cell]
It inherits from layers.Layer, so it is a custom layer that can be used like any other Keras layer.

📍 inputs is a tuple containing:
    z_mean: the mean vector output by the encoder.
    z_log_var: the log-variance vector output by the encoder.
The call method unpacks these into two separate variables.

📍 Why log variance? Instead of the variance σ² we work with log(σ²) because:
    Numerical stability: the log prevents exploding/vanishing gradients.
    Easier optimization: exp(log(σ²) / 2) keeps the standard deviation always positive.

📍 tf.shape gives:
    batch: the number of samples in the batch.
    dim: the size of the latent space (e.g., 128 if LATENT_DIM = 128).

📍 epsilon is drawn from a standard normal distribution 𝒩(0, 1) with shape (batch, dim), so every sample gets a unique noise vector. Why do we need epsilon? Instead of using z_mean directly, we add controlled randomness so the VAE learns a smooth latent space.

Reparameterization trick:
    The latent space follows a normal distribution: z ∼ 𝒩(μ, σ²).
    A sample is drawn from this distribution: z = μ + σ · ε, where ε ∼ 𝒩(0, 1).
    Since z_log_var = log(σ²), we compute σ = exp(0.5 · log(σ²)) = exp(0.5 · z_log_var).
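As a quick numeric check of the reparameterization step described above (an added illustration, not one of the notebook's cells), the same z = μ + exp(0.5 · log σ²) · ε recipe in plain NumPy; the chosen μ and σ values are arbitrary:

import numpy as np

rng = np.random.default_rng(0)
mu, log_var = 1.5, np.log(0.25)            # target distribution N(1.5, 0.25), i.e. sigma = 0.5

eps = rng.standard_normal(100_000)         # epsilon ~ N(0, 1)
z = mu + np.exp(0.5 * log_var) * eps       # reparameterized samples

print(round(z.mean(), 3), round(z.std(), 3))   # close to 1.5 and 0.5, i.e. mu and sigma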
290 |
+
{
|
291 |
+
"cell_type": "markdown",
|
292 |
+
"metadata": {},
|
293 |
+
"source": [
|
294 |
+
"*🔹 Residual Block in Detail*\n",
|
295 |
+
"\n",
|
296 |
+
"This function defines a residual block, a key building block inspired by ResNet (Residual Networks). Residual blocks help in training deep neural networks efficiently by allowing gradient flow through skip connections.\n",
|
297 |
+
"\n",
|
298 |
+
"inputs: The input tensor (features from the previous layer).\n",
|
299 |
+
"\n",
|
300 |
+
"filters: The number of filters (channels) in the convolution layers.\n",
|
301 |
+
"\n",
|
302 |
+
"use_norm: Whether to apply Group Normalization (helps stabilize training)\n",
|
303 |
+
"\n",
|
304 |
+
"Step 1️⃣: First Convolution + Activation :\n",
|
305 |
+
"\n",
|
306 |
+
" Applies a 2D Convolution (Conv2D) with filters filters.\n",
|
307 |
+
"\n",
|
308 |
+
" KERNEL (not defined in this function) should be the kernel size (e.g., 3x3 or 5x5).\n",
|
309 |
+
"\n",
|
310 |
+
" padding='same': Ensures the output size is the same as the input.\n",
|
311 |
+
"\n",
|
312 |
+
" Leaky ReLU activation (alpha=0.2):\n",
|
313 |
+
"\n",
|
314 |
+
" Helps avoid dead neurons (better than regular ReLU).\n",
|
315 |
+
" \n",
|
316 |
+
" Allows a small gradient flow for negative values.\n",
|
317 |
+
"\n",
|
318 |
+
"Step 2️⃣: Group Normalization (Optional)\n",
|
319 |
+
"\n",
|
320 |
+
"Step 3️⃣: Second Convolution + Activation\n",
|
321 |
+
"\n",
|
322 |
+
" Applies another Conv2D layer with the same number of filters.\n",
|
323 |
+
"\n",
|
324 |
+
" Uses LeakyReLU again for better gradient flow.\n",
|
325 |
+
"\n",
|
326 |
+
" Why two convolutions?\n",
|
327 |
+
"\n",
|
328 |
+
" The first convolution learns low-level features.\n",
|
329 |
+
" \n",
|
330 |
+
" The second convolution refines the learned features.\n",
|
331 |
+
"\n",
|
332 |
+
"\n",
|
333 |
+
"Step 5️⃣: Shortcut Connection (Skip Connection)\n",
|
334 |
+
"\n",
|
335 |
+
" The original input is passed through a 1x1 convolution.\n",
|
336 |
+
"\n",
|
337 |
+
" This matches the number of filters with the residual output.\n",
|
338 |
+
"\n",
|
339 |
+
" Why 1x1 convolution?\n",
|
340 |
+
"\n",
|
341 |
+
" Ensures the shortcut has the same number of filters as x.\n",
|
342 |
+
"\n",
|
343 |
+
" Helps in adjusting dimensions when the number of channels changes.\n",
|
344 |
+
"\n",
|
345 |
+
"Step 6️⃣: Merge Shortcut & Residual Path\n",
|
346 |
+
"\n",
|
347 |
+
" Merges the shortcut and residual path using element-wise maximum.\n",
|
348 |
+
" \n",
|
349 |
+
" Why maximum() instead of addition (+)?\n",
|
350 |
+
"\n",
|
351 |
+
" Prevents negative values, which can help improve training stability.\n",
|
352 |
+
" \n",
|
353 |
+
" Focuses on stronger features from either the residual or shortcut path.\n",
|
354 |
+
"\n",
|
355 |
+
"\n",
|
356 |
+
"\n",
|
357 |
+
"\n",
|
358 |
+
"\n",
|
359 |
+
"\n",
|
360 |
+
"\n",
|
361 |
+
"\n"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": 5,
|
367 |
+
"metadata": {},
|
368 |
+
"outputs": [],
|
369 |
+
"source": [
|
370 |
+
"def residual_block(inputs, filters, use_norm=True):\n",
|
371 |
+
" x = layers.Conv2D(filters, KERNEL, padding='same')(inputs)\n",
|
372 |
+
" x = layers.LeakyReLU(alpha=0.2)(x)\n",
|
373 |
+
" if use_norm:\n",
|
374 |
+
" x = layers.GroupNormalization(groups=1)(x)\n",
|
375 |
+
" x = layers.Conv2D(filters, KERNEL, padding='same')(x)\n",
|
376 |
+
" x = layers.LeakyReLU(alpha=0.2)(x)\n",
|
377 |
+
" if use_norm:\n",
|
378 |
+
" x = layers.GroupNormalization(groups=1)(x)\n",
|
379 |
+
" shortcut = layers.Conv2D(filters, 1, padding='same')(inputs)\n",
|
380 |
+
" return layers.maximum([x, shortcut])"
|
381 |
+
]
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"cell_type": "markdown",
|
385 |
+
"metadata": {},
|
386 |
+
"source": [
|
387 |
+
"* 1️⃣ Encoder and Decoder Block*\n",
|
388 |
+
"\n",
|
389 |
+
"*Encoder*\n",
|
390 |
+
"\n",
|
391 |
+
"1️⃣ Pass Input Through Residual Block\n",
|
392 |
+
"\n",
|
393 |
+
"Uses a residual block (previously defined).\n",
|
394 |
+
"\n",
|
395 |
+
"Extracts important features while keeping the original information.\n",
|
396 |
+
"\n",
|
397 |
+
"Helps prevent vanishing gradients and allows deep networks to train effectively.\n",
|
398 |
+
"\n",
|
399 |
+
"2️⃣ Store the Skip Connection\n",
|
400 |
+
"\n",
|
401 |
+
"The skip connection stores the output of the residual block.\n",
|
402 |
+
"\n",
|
403 |
+
"It will be used later in the decoder to restore lost details.\n",
|
404 |
+
"\n",
|
405 |
+
"3️⃣ Downsampling (Reduce Spatial Size)\n",
|
406 |
+
"\n",
|
407 |
+
"Applies Max Pooling to reduce the spatial size (height & width).\n",
|
408 |
+
"\n",
|
409 |
+
"Why?\n",
|
410 |
+
"\n",
|
411 |
+
"Reduces computation.\n",
|
412 |
+
"\n",
|
413 |
+
"Forces the model to learn high-level features instead of pixel details.\n",
|
414 |
+
"\n",
|
415 |
+
"4️⃣ Return Downsampled Output & Skip Connection\n",
|
416 |
+
"\n",
|
417 |
+
"Outputs:\n",
|
418 |
+
"\n",
|
419 |
+
"x: The downsampled feature map.\n",
|
420 |
+
"\n",
|
421 |
+
"skip: The saved feature map (used later in the decoder).\n",
|
422 |
+
"\n",
|
423 |
+
"\n",
|
424 |
+
"🔥 2️⃣ Decoder Block\n",
|
425 |
+
"\n",
|
426 |
+
"1️⃣ Upsampling (Increase Spatial Size):\n",
|
427 |
+
"\n",
|
428 |
+
"Uses Conv2DTranspose (transposed convolution, aka deconvolution).\n",
|
429 |
+
"\n",
|
430 |
+
"Upsamples the input by a factor of 2 (increases spatial size).\n",
|
431 |
+
"\n",
|
432 |
+
"Why?\n",
|
433 |
+
"\n",
|
434 |
+
"Increases resolution to match the original input image.\n",
|
435 |
+
"\n",
|
436 |
+
"2️⃣ Merge Skip Connection\n",
|
437 |
+
"\n",
|
438 |
+
"Combines the upsampled output with the skip connection.\n",
|
439 |
+
"\n",
|
440 |
+
"Uses element-wise maximum instead of addition.\n",
|
441 |
+
"\n",
|
442 |
+
"Why?\n",
|
443 |
+
"\n",
|
444 |
+
"Ensures the model focuses on the most important features.\n",
|
445 |
+
"\n",
|
446 |
+
"Prevents loss of key information during encoding.\n",
|
447 |
+
"\n",
|
448 |
+
"3️⃣ Apply a Residual Block\n",
|
449 |
+
"\n",
|
450 |
+
"Uses a residual block to refine the upsampled output.\n",
|
451 |
+
"\n",
|
452 |
+
"Helps recover lost details and maintain stability.\n",
|
453 |
+
"\n",
|
454 |
+
"4️⃣ Return the Processed Output\n",
|
455 |
+
"\n",
|
456 |
+
"Returns the final feature map after upsampling and refinement.\n",
|
457 |
+
"\n",
|
458 |
+
"\n"
|
459 |
+
]
|
460 |
+
},
|
461 |
+
{
|
462 |
+
"cell_type": "code",
|
463 |
+
"execution_count": 6,
|
464 |
+
"metadata": {},
|
465 |
+
"outputs": [],
|
466 |
+
"source": [
|
467 |
+
"def encoder_block(inputs, filters, use_norm=True):\n",
|
468 |
+
" x = residual_block(inputs, filters, use_norm)\n",
|
469 |
+
" skip = x\n",
|
470 |
+
" x = layers.MaxPooling2D()(x)\n",
|
471 |
+
" return x, skip\n",
|
472 |
+
"\n",
|
473 |
+
"def decoder_block(inputs, skip, filters, use_norm=True):\n",
|
474 |
+
" x = layers.Conv2DTranspose(filters, KERNEL, strides=2, padding='same')(inputs)\n",
|
475 |
+
" x = layers.maximum([x, skip])\n",
|
476 |
+
" x = residual_block(x, filters, use_norm)\n",
|
477 |
+
" return x\n"
|
478 |
+
]
|
479 |
+
},
|
480 |
+
{
|
481 |
+
"cell_type": "markdown",
|
482 |
+
"metadata": {},
|
483 |
+
"source": [
|
484 |
+
"* ===================== Generator =====================*"
|
485 |
+
]
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"cell_type": "markdown",
|
489 |
+
"metadata": {},
|
490 |
+
"source": [
|
491 |
+
"This function builds the generator model for a Variational Autoencoder (VAE) with a CycleGAN architecture. The generator is responsible for converting a CT scan into an MRI image (or vice versa) by learning to map the two domains.\n",
|
492 |
+
"\n",
|
493 |
+
".\n",
|
494 |
+
"\n",
|
495 |
+
"🛠️ What This Function Does?\n",
|
496 |
+
"\n",
|
497 |
+
"It encodes an input image into a latent space.\n",
|
498 |
+
"\n",
|
499 |
+
"It applies variational sampling to introduce a probabilistic distribution.\n",
|
500 |
+
"\n",
|
501 |
+
"It decodes the latent representation back into an image.\n",
|
502 |
+
"\n",
|
503 |
+
"Uses skip connections to retain features across layers.\n",
|
504 |
+
"\n",
|
505 |
+
"1️⃣ Input Layer\n",
|
506 |
+
"\n",
|
507 |
+
"Defines the input tensor with the given IMAGE_SHAPE (e.g., (256, 256, 3), for RGB images).\n",
|
508 |
+
"\n",
|
509 |
+
"2️⃣ Encoder: Downsampling the Image\n",
|
510 |
+
"\n",
|
511 |
+
" Each encoder block halves the spatial resolution but doubles the filters.\n",
|
512 |
+
"\n",
|
513 |
+
" Stores skip connections (s1, s2, ..., s7) for later use in the decoder.\n",
|
514 |
+
"\n",
|
515 |
+
" After e7, the image is highly compressed into a feature map.\n",
|
516 |
+
"\n",
|
517 |
+
"3️⃣ Latent Space (Variational Sampling)\n",
|
518 |
+
"\n",
|
519 |
+
" Flattens the feature map into a 1D vector.\n",
|
520 |
+
"\n",
|
521 |
+
" Uses two dense layers to compute:\n",
|
522 |
+
"\n",
|
523 |
+
" z_mean → The mean of the latent distribution.\n",
|
524 |
+
"\n",
|
525 |
+
" z_log_var → The logarithm of the variance.\n",
|
526 |
+
"\n",
|
527 |
+
" Uses reparameterization trick (Sampling layer) to ensure backpropagation works in VAE.\n",
|
528 |
+
"\n",
|
529 |
+
"4️⃣ Reshape for Decoder\n",
|
530 |
+
"\n",
|
531 |
+
" Expands z into a 2x2 feature map to match e7 dimensions.\n",
|
532 |
+
"\n",
|
533 |
+
" Prepares the latent vector for decoding.\n",
|
534 |
+
"\n",
|
535 |
+
"5️⃣ Decoder: Upsampling the Image\n",
|
536 |
+
"\n",
|
537 |
+
" Each decoder block upsamples the feature map back to the original size.\n",
|
538 |
+
"\n",
|
539 |
+
" Uses skip connections (s1, s2, ..., s7) to restore spatial information.\n",
|
540 |
+
"\n",
|
541 |
+
" Mirrors the encoder process but in reverse.\n",
|
542 |
+
"\n",
|
543 |
+
"6️⃣ Final Output Layer\n",
|
544 |
+
"\n",
|
545 |
+
" Uses a Conv2D layer to produce the final RGB image.\n",
|
546 |
+
" \n",
|
547 |
+
" Applies sigmoid activation to ensure pixel values remain between [0,1].\n",
|
548 |
+
"\n",
|
549 |
+
"\n",
|
550 |
+
"\n",
|
551 |
+
"\n",
|
552 |
+
"\n",
|
553 |
+
"\n"
|
554 |
+
]
|
555 |
+
},
|
556 |
+
{
|
557 |
+
"cell_type": "code",
|
558 |
+
"execution_count": 7,
|
559 |
+
"metadata": {},
|
560 |
+
"outputs": [],
|
561 |
+
"source": [
|
562 |
+
"def build_generator(name):\n",
|
563 |
+
" inputs = layers.Input(IMAGE_SHAPE)\n",
|
564 |
+
" \n",
|
565 |
+
" # Encoder\n",
|
566 |
+
" e1, s1 = encoder_block(inputs, FILTERS)\n",
|
567 |
+
" e2, s2 = encoder_block(e1, FILTERS*2)\n",
|
568 |
+
" e3, s3 = encoder_block(e2, FILTERS*4)\n",
|
569 |
+
" e4, s4 = encoder_block(e3, FILTERS*8)\n",
|
570 |
+
" e5, s5 = encoder_block(e4, FILTERS*16)\n",
|
571 |
+
" e6, s6 = encoder_block(e5, FILTERS*32)\n",
|
572 |
+
" e7, s7 = encoder_block(e6, FILTERS*64)\n",
|
573 |
+
" \n",
|
574 |
+
" # Latent Space\n",
|
575 |
+
" x = layers.Flatten()(e7)\n",
|
576 |
+
" z_mean = layers.Dense(LATENT_DIM, name=f\"z_mean_{name.split('_')[-1]}\")(x)\n",
|
577 |
+
" z_log_var = layers.Dense(LATENT_DIM, name=f\"z_log_var_{name.split('_')[-1]}\")(x)\n",
|
578 |
+
" z = Sampling()([z_mean, z_log_var])\n",
|
579 |
+
" \n",
|
580 |
+
" # Reshape for decoder\n",
|
581 |
+
" x = layers.Dense(2 * 2 * FILTERS*64)(z)\n",
|
582 |
+
" x = layers.Reshape((2, 2, FILTERS*64))(x)\n",
|
583 |
+
" \n",
|
584 |
+
" # Decoder\n",
|
585 |
+
" d0 = decoder_block(x, s7, FILTERS*64)\n",
|
586 |
+
" d1 = decoder_block(d0, s6, FILTERS*32)\n",
|
587 |
+
" d2 = decoder_block(d1, s5, FILTERS*16)\n",
|
588 |
+
" d3 = decoder_block(d2, s4, FILTERS*8)\n",
|
589 |
+
" d4 = decoder_block(d3, s3, FILTERS*4)\n",
|
590 |
+
" d5 = decoder_block(d4, s2, FILTERS*2)\n",
|
591 |
+
" d6 = decoder_block(d5, s1, FILTERS)\n",
|
592 |
+
" \n",
|
593 |
+
" outputs = layers.Conv2D(3, KERNEL, activation='sigmoid', padding='same')(d6)\n",
|
594 |
+
" return Model(inputs, [outputs, z_mean, z_log_var], name=name)"
|
595 |
+
]
|
596 |
+
},
|
597 |
+
{
|
598 |
+
"cell_type": "markdown",
|
599 |
+
"metadata": {},
|
600 |
+
"source": [
|
601 |
+
"\n",
|
602 |
+
"*===================== Discriminator =====================*\n",
|
603 |
+
"\n",
|
604 |
+
"\n",
|
605 |
+
"This function constructs the discriminator in a Generative Adversarial Network (GAN). The discriminator’s role is to classify an image as real or fake by extracting hierarchical features and making multi-scale predictions.\n",
|
606 |
+
"\n",
|
607 |
+
"What Does This Function Do?\n",
|
608 |
+
"\n",
|
609 |
+
" Extracts features from the input image using convolutional layers.\n",
|
610 |
+
"\n",
|
611 |
+
" Downsamples the image through multiple layers to capture both local and global features.\n",
|
612 |
+
"\n",
|
613 |
+
" Generates multiple outputs from different feature scales for better discrimination.\n",
|
614 |
+
"\n",
|
615 |
+
"1️⃣ Input Layer \n",
|
616 |
+
"\n",
|
617 |
+
" Defines the input tensor with a shape of IMAGE_SHAPE (e.g., (256, 256, 3) for RGB images).\n",
|
618 |
+
"\n",
|
619 |
+
" This means the discriminator takes an image as input.\n",
|
620 |
+
"\n",
|
621 |
+
"2️⃣ Feature Extraction\n",
|
622 |
+
"\n",
|
623 |
+
" x = inputs initializes x as the input image.\n",
|
624 |
+
"\n",
|
625 |
+
" features = [] creates a list to store intermediate feature map\n",
|
626 |
+
"\n",
|
627 |
+
"3️⃣ Initial Convolution\n",
|
628 |
+
"\n",
|
629 |
+
" Applies a convolutional layer (Conv2D) with FILTERS (e.g., 64 filters) to extract basic edges and textures.\n",
|
630 |
+
"\n",
|
631 |
+
" Uses LeakyReLU activation (alpha=0.2) instead of ReLU to allow small gradients for negative values.\n",
|
632 |
+
"\n",
|
633 |
+
" Stores the feature map in features.\n",
|
634 |
+
"\n",
|
635 |
+
"4️⃣ Downsampling Blocks (Feature Hierarchy)\n",
|
636 |
+
"\n",
|
637 |
+
" Defines filter_sizes, increasing filter count at each stage to learn complex features.\n",
|
638 |
+
"\n",
|
639 |
+
" Uses a loop to pass x through multiple encoder_block layers:\n",
|
640 |
+
"\n",
|
641 |
+
" Each encoder_block downsamples the feature map (reducing spatial size).\n",
|
642 |
+
"\n",
|
643 |
+
" Each block doubles the number of filters to capture more detailed features.\n",
|
644 |
+
"\n",
|
645 |
+
" Stores all extracted feature maps in features.\n",
|
646 |
+
"\n",
|
647 |
+
"5️⃣ Multi-Scale Outputs (Final Classification Layers)\n",
|
648 |
+
"\n",
|
649 |
+
" The discriminator does not produce a single output; it uses multiple feature scales.\n",
|
650 |
+
"\n",
|
651 |
+
" Extracts the last 4 feature maps (features[-4:]) to classify at different resolutions.\n",
|
652 |
+
"\n",
|
653 |
+
" Each feature map is passed through a final Conv2D layer with 1 filter to predict real vs fake scores.\n",
|
654 |
+
"\n",
|
655 |
+
" Stores the outputs in outputs.\n",
|
656 |
+
"\n",
|
657 |
+
"6️⃣ Return the Discriminator Model\n",
|
658 |
+
"\n",
|
659 |
+
" Creates a Keras Model that takes an image as input and outputs multiple classification scores.\n",
|
660 |
+
"\n",
|
661 |
+
" This helps in making fine-grained real/fake decisions.\n"
|
662 |
+
]
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"cell_type": "code",
|
666 |
+
"execution_count": 9,
|
667 |
+
"metadata": {},
|
668 |
+
"outputs": [],
|
669 |
+
"source": [
|
670 |
+
"def build_discriminator(name):\n",
|
671 |
+
" inputs = layers.Input(IMAGE_SHAPE)\n",
|
672 |
+
" \n",
|
673 |
+
" # Feature extraction\n",
|
674 |
+
" x = inputs\n",
|
675 |
+
" features = []\n",
|
676 |
+
" \n",
|
677 |
+
" # Initial convolution\n",
|
678 |
+
" x = layers.Conv2D(FILTERS, KERNEL, padding='same')(x)\n",
|
679 |
+
" x = layers.LeakyReLU(alpha=0.2)(x)\n",
|
680 |
+
" features.append(x)\n",
|
681 |
+
" \n",
|
682 |
+
" # Downsampling blocks\n",
|
683 |
+
" filter_sizes = [FILTERS*2, FILTERS*4, FILTERS*8, FILTERS*16, FILTERS*32, FILTERS*64]\n",
|
684 |
+
" for filters in filter_sizes:\n",
|
685 |
+
" x, _ = encoder_block(x, filters, use_norm=False)\n",
|
686 |
+
" features.append(x)\n",
|
687 |
+
" \n",
|
688 |
+
" # Multi-scale outputs\n",
|
689 |
+
" outputs = []\n",
|
690 |
+
" for i, feature in enumerate(features[-4:]):\n",
|
691 |
+
" out = layers.Conv2D(1, KERNEL, padding='same')(feature)\n",
|
692 |
+
" outputs.append(out)\n",
|
693 |
+
" \n",
|
694 |
+
" return Model(inputs, outputs, name=name)\n"
|
695 |
+
]
|
696 |
+
},
|
697 |
+
{
|
698 |
+
"cell_type": "markdown",
|
699 |
+
"metadata": {},
|
700 |
+
"source": [
|
701 |
+
"*===================== Data Loading =====================*\n",
|
702 |
+
"\n"
|
703 |
+
]
|
704 |
+
},
|
705 |
+
{
|
706 |
+
"cell_type": "code",
|
707 |
+
"execution_count": 10,
|
708 |
+
"metadata": {},
|
709 |
+
"outputs": [],
|
710 |
+
"source": [
|
711 |
+
"def load_images(path):\n",
|
712 |
+
" images = []\n",
|
713 |
+
" for p in pathlib.Path(path).glob('*.*'):\n",
|
714 |
+
" try:\n",
|
715 |
+
" img = cv2.imread(str(p))\n",
|
716 |
+
" if img is not None:\n",
|
717 |
+
" img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
|
718 |
+
" img = cv2.resize(img, IMAGE_SHAPE[:2])\n",
|
719 |
+
" img = img.astype(np.float32) / 255.0\n",
|
720 |
+
" images.append(img)\n",
|
721 |
+
" except Exception as e:\n",
|
722 |
+
" print(f\"Error loading image {p}: {e}\")\n",
|
723 |
+
" return np.array(images)"
|
724 |
+
]
|
725 |
+
},
|
726 |
+
{
|
727 |
+
"cell_type": "markdown",
|
728 |
+
"metadata": {},
|
729 |
+
"source": [
|
730 |
+
"This function is responsible for loading and balancing two different medical imaging datasets: CT scans and MRI scans. The goal is to ensure that both datasets contain the same number of images to avoid class imbalance in training.\n",
|
731 |
+
"\n",
|
732 |
+
"\n",
|
733 |
+
"📌 What Does This Function Do?\n",
|
734 |
+
"\n",
|
735 |
+
"Loads CT scans from the given directory.\n",
|
736 |
+
"Loads MRI scans from the given directory.\n",
|
737 |
+
"Finds the smaller dataset (CT or MRI) and trims the larger one to match its size.\n",
|
738 |
+
"Returns balanced datasets with the same number of images.\n",
|
739 |
+
"\n",
|
740 |
+
"1️⃣ Loading CT Scans:\n",
|
741 |
+
"\n",
|
742 |
+
" Prints \"Loading CT scans...\" to inform the user.\n",
|
743 |
+
"\n",
|
744 |
+
" Calls load_images(ct_path), a function (likely defined elsewhere) that reads images from the directory specified by ct_path.\n",
|
745 |
+
"\n",
|
746 |
+
" Stores the loaded images in ct_scans.\n",
|
747 |
+
"\n",
|
748 |
+
"2️⃣ Loading MRI Scans\n",
|
749 |
+
"\n",
|
750 |
+
" Prints \"Loading MRI scans...\" to indicate MRI loading.\n",
|
751 |
+
"\n",
|
752 |
+
" Calls load_images(mri_path), which loads images from mri_path.\n",
|
753 |
+
"\n",
|
754 |
+
" Stores the MRI images in mri_scans.\n",
|
755 |
+
"\n",
|
756 |
+
"3️⃣ Balancing the Datasets\n",
|
757 |
+
"\n",
|
758 |
+
" Computes the minimum length between the two datasets.\n",
|
759 |
+
"\n",
|
760 |
+
" Ensures that the dataset with more images is trimmed to match the smaller one.\n",
|
761 |
+
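"\n",
"A minimal usage sketch of this loader (the function itself appears in the next cell; the directory names here are illustrative, and the real Kaggle paths are used in the main training loop further down):\n",
"\n",
"```python\n",
"# Both arrays come back with the same length; each image is 256x256x3, scaled to [0, 1]\n",
"ct_scans, mri_scans = load_and_balance_datasets('Dataset/images/trainA',\n",
"                                                'Dataset/images/trainB')\n",
"print(ct_scans.shape, mri_scans.shape)\n",
"```\n",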
"\n",
|
762 |
+
" Computes the minimum length between the two datasets.\n",
|
763 |
+
"\n",
|
764 |
+
" Ensures that the dataset with more images is trimmed to match the smaller one.\n",
|
765 |
+
"\n",
|
766 |
+
"\n",
|
767 |
+
"\n",
|
768 |
+
"\n",
|
769 |
+
"\n",
|
770 |
+
"\n",
|
771 |
+
"\n",
|
772 |
+
"\n",
|
773 |
+
" \n",
|
774 |
+
"\n"
|
775 |
+
]
|
776 |
+
},
|
777 |
+
{
|
778 |
+
"cell_type": "code",
|
779 |
+
"execution_count": 11,
|
780 |
+
"metadata": {},
|
781 |
+
"outputs": [],
|
782 |
+
"source": [
|
783 |
+
"\n",
|
784 |
+
"def load_and_balance_datasets(ct_path, mri_path):\n",
|
785 |
+
" print(\"Loading CT scans...\")\n",
|
786 |
+
" ct_scans = load_images(ct_path)\n",
|
787 |
+
" print(\"Loading MRI scans...\")\n",
|
788 |
+
" mri_scans = load_images(mri_path)\n",
|
789 |
+
" \n",
|
790 |
+
" min_length = min(len(ct_scans), len(mri_scans))\n",
|
791 |
+
" ct_scans = ct_scans[:min_length]\n",
|
792 |
+
" mri_scans = mri_scans[:min_length]\n",
|
793 |
+
" \n",
|
794 |
+
" print(f\"Balanced datasets to {min_length} images each\")\n",
|
795 |
+
" return ct_scans, mri_scans"
|
796 |
+
]
|
797 |
+
},
|
798 |
+
{
|
799 |
+
"cell_type": "markdown",
|
800 |
+
"metadata": {},
|
801 |
+
"source": [
|
802 |
+
"*Training Setup - Detailed Explanation*\n",
|
803 |
+
"\n",
|
804 |
+
"This block of code sets up the models and optimizers required for training a CycleGAN for CT ↔ MRI image translation. Let’s break it down step by step.\n",
|
805 |
+
"\n",
|
806 |
+
"📌 What Does This Code Do?\n",
|
807 |
+
" Builds the generator models (CT → MRI and MRI → CT).\n",
|
808 |
+
"\n",
|
809 |
+
" Builds the discriminator models for CT and MRI.\n",
|
810 |
+
"\n",
|
811 |
+
" Creates optimizers for training the generators and discriminators.\n",
|
812 |
+
"\n",
|
813 |
+
" Initializes model variables (trainable parameters for both generators and discriminators).\n",
|
814 |
+
"\n",
|
815 |
+
" Builds optimizers using the trainable variables.\n",
|
816 |
+
"\n",
|
817 |
+
"1️⃣ Building the Generator Models\n",
|
818 |
+
"\n",
|
819 |
+
" build_generator(name): This function (explained earlier) builds a U-Net-based Variational Autoencoder (VAE) generator.\n",
|
820 |
+
"\n",
|
821 |
+
" g_ct_mri: The generator that converts CT scans → MRI images.\n",
|
822 |
+
"\n",
|
823 |
+
" g_mri_ct: The generator that converts MRI images → CT scans\n",
|
824 |
+
"\n",
|
825 |
+
"2️⃣ Building the Discriminator Models\n",
|
826 |
+
"\n",
|
827 |
+
" build_discriminator(name): This function (explained earlier) builds the discriminators to differentiate real and fake images.\n",
|
828 |
+
"\n",
|
829 |
+
" d_ct: The discriminator that distinguishes real CT scans from fake ones.\n",
|
830 |
+
"\n",
|
831 |
+
" d_mri: The discriminator that distinguishes real MRI scans from fake ones.\n",
|
832 |
+
"\n",
|
833 |
+
"\n",
|
834 |
+
"3️⃣ Creating Optimizers\n",
|
835 |
+
"\n",
|
836 |
+
" g_opt: Optimizer for training both generators.\n",
|
837 |
+
"\n",
|
838 |
+
" d_opt: Optimizer for training both discriminators.\n",
|
839 |
+
"\n",
|
840 |
+
" Uses RMSprop as the optimizer.\n",
|
841 |
+
"\n",
|
842 |
+
" The learning rate (LEARNING_RATE) controls the step size for updates.\n",
|
843 |
+
"\n",
|
844 |
+
" Weight decay (WEIGHT_DECAY) prevents overfitting by penalizing large weights.\n",
|
845 |
+
"\n",
|
846 |
+
"4️⃣ Initializing Model Variables\n",
|
847 |
+
"\n",
|
848 |
+
" g_vars: Stores all trainable variables (weights & biases) of both generators.\n",
|
849 |
+
" d_vars: Stores all trainable variables of both discriminators.\n",
|
850 |
+
"\n",
|
851 |
+
" 📝 Why store trainable variables separately?\n",
|
852 |
+
"\n",
|
853 |
+
" Since generators and discriminators have separate losses, they need to be updated separately.\n",
|
854 |
+
"\n",
|
855 |
+
"5️⃣ Building Optimizers with Model Variables\n",
|
856 |
+
"\n",
|
857 |
+
" g_opt.build(g_vars): Tells TensorFlow that g_opt will optimize generator variables.\n",
|
858 |
+
"\n",
|
859 |
+
" d_opt.build(d_vars): Tells TensorFlow that d_opt will optimize discriminator variables.\n",
|
860 |
+
"\n",
|
861 |
+
" 📝 Why explicitly build the optimizers?\n",
|
862 |
+
"\n",
|
863 |
+
" In Eager Execution mode, TensorFlow automatically tracks variables.\n",
|
864 |
+
" \n",
|
865 |
+
" However, explicitly calling build() can help with performance optimization.\n",
|
866 |
+
"\n",
|
867 |
+
"\n"
|
868 |
+
]
|
869 |
+
},
|
870 |
+
{
|
871 |
+
"cell_type": "code",
|
872 |
+
"execution_count": 12,
|
873 |
+
"metadata": {},
|
874 |
+
"outputs": [
|
875 |
+
{
|
876 |
+
"name": "stdout",
|
877 |
+
"output_type": "stream",
|
878 |
+
"text": [
|
879 |
+
"WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\keras\\src\\backend.py:1398: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.\n",
|
880 |
+
"\n",
|
881 |
+
"WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\keras\\src\\layers\\pooling\\max_pooling2d.py:161: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n",
|
882 |
+
"\n"
|
883 |
+
]
|
884 |
+
},
|
885 |
+
{
|
886 |
+
"ename": "NameError",
|
887 |
+
"evalue": "name 'Sampling' is not defined",
|
888 |
+
"output_type": "error",
|
889 |
+
"traceback": [
|
890 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
891 |
+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
892 |
+
"Cell \u001b[1;32mIn[12], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ===================== Training Setup =====================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Build models\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m g_ct_mri \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_generator\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mCT_to_MRI\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m g_mri_ct \u001b[38;5;241m=\u001b[39m build_generator(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMRI_to_CT\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 5\u001b[0m d_ct \u001b[38;5;241m=\u001b[39m build_discriminator(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD_CT\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
893 |
+
"Cell \u001b[1;32mIn[7], line 17\u001b[0m, in \u001b[0;36mbuild_generator\u001b[1;34m(name)\u001b[0m\n\u001b[0;32m 15\u001b[0m z_mean \u001b[38;5;241m=\u001b[39m layers\u001b[38;5;241m.\u001b[39mDense(LATENT_DIM, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mz_mean_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)(x)\n\u001b[0;32m 16\u001b[0m z_log_var \u001b[38;5;241m=\u001b[39m layers\u001b[38;5;241m.\u001b[39mDense(LATENT_DIM, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mz_log_var_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)(x)\n\u001b[1;32m---> 17\u001b[0m z \u001b[38;5;241m=\u001b[39m \u001b[43mSampling\u001b[49m()([z_mean, z_log_var])\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# Reshape for decoder\u001b[39;00m\n\u001b[0;32m 20\u001b[0m x \u001b[38;5;241m=\u001b[39m layers\u001b[38;5;241m.\u001b[39mDense(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m FILTERS\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m64\u001b[39m)(z)\n",
|
894 |
+
"\u001b[1;31mNameError\u001b[0m: name 'Sampling' is not defined"
|
895 |
+
]
|
896 |
+
}
|
897 |
+
],
|
898 |
+
"source": [
|
899 |
+
"# ===================== Training Setup =====================\n",
|
900 |
+
"# Build models\n",
|
901 |
+
"g_ct_mri = build_generator('CT_to_MRI')\n",
|
902 |
+
"g_mri_ct = build_generator('MRI_to_CT')\n",
|
903 |
+
"d_ct = build_discriminator('D_CT')\n",
|
904 |
+
"d_mri = build_discriminator('D_MRI')\n",
|
905 |
+
"\n",
|
906 |
+
"# Create optimizers\n",
|
907 |
+
"g_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
|
908 |
+
"d_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
|
909 |
+
"\n",
|
910 |
+
"# Initialize model variables\n",
|
911 |
+
"g_vars = g_ct_mri.trainable_variables + g_mri_ct.trainable_variables\n",
|
912 |
+
"d_vars = d_ct.trainable_variables + d_mri.trainable_variables\n",
|
913 |
+
"\n",
|
914 |
+
"# Build optimizers\n",
|
915 |
+
"g_opt.build(g_vars)\n",
|
916 |
+
"d_opt.build(d_vars)"
|
917 |
+
]
|
918 |
+
},
|
919 |
+
{
|
920 |
+
"cell_type": "markdown",
|
921 |
+
"metadata": {},
|
922 |
+
"source": [
|
923 |
+
"*Explanation of train_step Function in CycleGAN with Variational Autoencoder (VAE)*\n",
|
924 |
+
"\n",
|
925 |
+
"This function performs one training step for the CycleGAN with VAE-style latent representations. It does the following:\n",
|
926 |
+
"\n",
|
927 |
+
"Generates fake images using the generators.\n",
|
928 |
+
"\n",
|
929 |
+
"Evaluates the fake and real images using the discriminators.\n",
|
930 |
+
"\n",
|
931 |
+
"Computes the loss functions for both generators and discriminators.\n",
|
932 |
+
"\n",
|
933 |
+
"Computes gradients and updates the model parameters.\n",
|
934 |
+
"\n",
|
935 |
+
"1️⃣ Forward Pass - Generate Fake Images\n",
|
936 |
+
"\n",
|
937 |
+
" g_ct_mri(real_ct): Translates CT → Fake MRI and produces:\n",
|
938 |
+
" \n",
|
939 |
+
" fake_mri: The generated MRI image.\n",
|
940 |
+
" z_mean_fwd, z_log_var_fwd: Latent variables (from the Variational Autoencoder).\n",
|
941 |
+
" g_mri_ct(real_mri): Translates MRI → Fake CT with similar outputs.\n",
|
942 |
+
"\n",
|
943 |
+
" 📝 Why store z_mean and z_log_var?\n",
|
944 |
+
"\n",
|
945 |
+
" These come from the VAE latent space and are used for the KL divergence loss.\n",
|
946 |
+
"\n",
|
947 |
+
"2️⃣ Compute Discriminator Outputs\n",
|
948 |
+
"\n",
|
949 |
+
" d_ct(real_ct): Discriminator’s prediction for real CT images.\n",
|
950 |
+
"\n",
|
951 |
+
" d_ct(fake_ct): Discriminator’s prediction for fake CT images.\n",
|
952 |
+
"\n",
|
953 |
+
" d_mri(real_mri): Discriminator’s prediction for real MRI images.\n",
|
954 |
+
"\n",
|
955 |
+
" d_mri(fake_mri): Discriminator’s prediction for fake MRI images.\n",
|
956 |
+
"\n",
|
957 |
+
" 📝 Goal of Discriminators?\n",
|
958 |
+
"\n",
|
959 |
+
"\n",
|
960 |
+
" Real images should be classified close to 1.\n",
|
961 |
+
"\n",
|
962 |
+
" Fake images should be classified close to 0.\n",
|
963 |
+
"\n",
|
964 |
+
"3️⃣ Compute Discriminator Losses\n",
|
965 |
+
"\n",
|
966 |
+
" Uses Least Squares GAN (LSGAN) loss:\n",
|
967 |
+
"\n",
|
968 |
+
" For real images: (real - 1)^2 → Encourages real images to be classified as 1.\n",
|
969 |
+
"\n",
|
970 |
+
" For fake images: fake^2 → Encourages fake images to be classified as 0.\n",
|
971 |
+
"\n",
|
972 |
+
" sum([...]): If there are multiple output layers in the discriminator, we sum their losses.\n",
|
973 |
+
"\n",
|
974 |
+
" 📝 Why LSGAN loss?\n",
|
975 |
+
"\n",
|
976 |
+
" Helps stabilize training compared to standard GAN loss.\n",
|
977 |
+
"\n",
|
978 |
+
"\n",
|
979 |
+
"4️⃣ Cycle Consistency Loss (CycleGAN Component)\n",
|
980 |
+
"\n",
|
981 |
+
" cycled_ct = g_mri_ct(fake_mri): The fake MRI is translated back to CT.\n",
|
982 |
+
"\n",
|
983 |
+
" cycled_mri = g_ct_mri(fake_ct): The fake CT is translated back to MRI.\n",
|
984 |
+
"\n",
|
985 |
+
" 📝 Why cycle consistency?\n",
|
986 |
+
"\n",
|
987 |
+
" The network should learn round-trip consistency:\n",
|
988 |
+
"\n",
|
989 |
+
" CT → Fake MRI → CT (should look like original CT)\n",
|
990 |
+
"\n",
|
991 |
+
" MRI → Fake CT → MRI (should look like original MRI)\n",
|
992 |
+
"\n",
|
993 |
+
"5️⃣ KL Divergence Loss (VAE Component)\n",
|
994 |
+
"\n",
|
995 |
+
" This is the KL divergence loss from VAE:\n",
|
996 |
+
"\n",
|
997 |
+
" Encourages the latent space to follow a Gaussian distribution.\n",
|
998 |
+
"\n",
|
999 |
+
" Prevents mode collapse.\n",
|
1000 |
+
"\n",
|
1001 |
+
" 📝 Why add KL divergence loss?\n",
|
1002 |
+
"\n",
|
1003 |
+
" Regularizes the latent space so the generator produces diverse outputs.\n",
|
1004 |
+
"\n",
|
1005 |
+
"6️⃣ Compute Generator Losses\n",
|
1006 |
+
"\n",
|
1007 |
+
" The generator wants fake images to be classified as real (1), so we use:\n",
|
1008 |
+
"\n",
|
1009 |
+
" (fake - 1)^2 → Fake images should be close to 1.\n",
|
1010 |
+
"\n",
|
1011 |
+
" Cycle consistency loss: L1 loss (|original - reconstructed|).\n",
|
1012 |
+
"\n",
|
1013 |
+
" Encourages faithful reconstructions.\n",
|
1014 |
+
"\n",
|
1015 |
+
"\n",
|
1016 |
+
" Final generator loss combines:\n",
|
1017 |
+
"\n",
|
1018 |
+
" Adversarial loss (GAN loss).\n",
|
1019 |
+
"\n",
|
1020 |
+
" Cycle consistency loss (weighted by 10 for stronger enforcement).\n",
|
1021 |
+
"\n",
|
1022 |
+
" KL divergence loss (weighted by 0.5).\n",
|
1023 |
+
"\n",
|
1024 |
+
" \n",
|
1025 |
+
"7️⃣ Compute Total Discriminator Loss\n",
|
1026 |
+
"\n",
|
1027 |
+
"Adds both discriminator losses.\n",
|
1028 |
+
"\n",
|
1029 |
+
"8️⃣ Compute Gradients & Update Model Parameters\n",
|
1030 |
+
"\n",
|
1031 |
+
" Computes gradients of discriminator loss (d_total_loss).\n",
|
1032 |
+
"\n",
|
1033 |
+
" Updates discriminator weights (d_vars).\n",
|
1034 |
+
"\n",
|
1035 |
+
" Computes gradients of generator loss (g_total_loss).\n",
|
1036 |
+
"\n",
|
1037 |
+
" Updates generator weights (g_vars).\n",
|
1038 |
+
"\n",
|
1039 |
+
"📝 Why use tf.GradientTape(persistent=True)?\n",
|
1040 |
+
"\n",
|
1041 |
+
" We need gradients twice (once for discriminators, once for generators).\n",
|
1042 |
+
"\n",
|
1043 |
+
"\n"
|
1044 |
+
]
|
1045 |
+
},
|
1046 |
+
{
|
1047 |
+
"cell_type": "code",
|
1048 |
+
"execution_count": null,
|
1049 |
+
"metadata": {},
|
1050 |
+
"outputs": [],
|
1051 |
+
"source": [
|
1052 |
+
"import os\n",
|
1053 |
+
"@tf.function\n",
|
1054 |
+
"def train_step(real_ct, real_mri):\n",
|
1055 |
+
" with tf.GradientTape(persistent=True) as tape:\n",
|
1056 |
+
" # Forward passes\n",
|
1057 |
+
" fake_mri, z_mean_fwd, z_log_var_fwd = g_ct_mri(real_ct, training=True)\n",
|
1058 |
+
" fake_ct, z_mean_bwd, z_log_var_bwd = g_mri_ct(real_mri, training=True)\n",
|
1059 |
+
" \n",
|
1060 |
+
" # Discriminator outputs\n",
|
1061 |
+
" d_real_ct = d_ct(real_ct, training=True)\n",
|
1062 |
+
" d_fake_ct = d_ct(fake_ct, training=True)\n",
|
1063 |
+
" d_real_mri = d_mri(real_mri, training=True)\n",
|
1064 |
+
" d_fake_mri = d_mri(fake_mri, training=True)\n",
|
1065 |
+
" \n",
|
1066 |
+
" # Discriminator losses\n",
|
1067 |
+
" d_ct_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
|
1068 |
+
" for real, fake in zip(d_real_ct, d_fake_ct)])\n",
|
1069 |
+
" d_mri_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
|
1070 |
+
" for real, fake in zip(d_real_mri, d_fake_mri)])\n",
|
1071 |
+
" \n",
|
1072 |
+
" # Cycle consistency\n",
|
1073 |
+
" cycled_ct, _, _ = g_mri_ct(fake_mri, training=True)\n",
|
1074 |
+
" cycled_mri, _, _ = g_ct_mri(fake_ct, training=True)\n",
|
1075 |
+
" \n",
|
1076 |
+
" # KL Divergence\n",
|
1077 |
+
" kl_fwd = -0.5 * tf.reduce_mean(1 + z_log_var_fwd - tf.square(z_mean_fwd) - tf.exp(z_log_var_fwd))\n",
|
1078 |
+
" kl_bwd = -0.5 * tf.reduce_mean(1 + z_log_var_bwd - tf.square(z_mean_bwd) - tf.exp(z_log_var_bwd))\n",
|
1079 |
+
" \n",
|
1080 |
+
" # Generator losses\n",
|
1081 |
+
" g_adv_loss = sum([tf.reduce_mean((fake - 1)**2) for fake in d_fake_mri + d_fake_ct])\n",
|
1082 |
+
" g_cycle_loss = (tf.reduce_mean(tf.abs(real_ct - cycled_ct)) + \n",
|
1083 |
+
" tf.reduce_mean(tf.abs(real_mri - cycled_mri)))\n",
|
1084 |
+
" g_total_loss = g_adv_loss + 10 * g_cycle_loss + 0.5 * (kl_fwd + kl_bwd)\n",
|
1085 |
+
" \n",
|
1086 |
+
" # Total discriminator loss\n",
|
1087 |
+
" d_total_loss = d_ct_loss + d_mri_loss\n",
|
1088 |
+
" \n",
|
1089 |
+
" # Update discriminators\n",
|
1090 |
+
" d_grads = tape.gradient(d_total_loss, d_vars)\n",
|
1091 |
+
" d_opt.apply_gradients(zip(d_grads, d_vars))\n",
|
1092 |
+
" \n",
|
1093 |
+
" # Update generators\n",
|
1094 |
+
" g_grads = tape.gradient(g_total_loss, g_vars)\n",
|
1095 |
+
" g_opt.apply_gradients(zip(g_grads, g_vars))\n",
|
1096 |
+
" \n",
|
1097 |
+
" return {\n",
|
1098 |
+
" 'd_ct': d_ct_loss,\n",
|
1099 |
+
" 'd_mri': d_mri_loss,\n",
|
1100 |
+
" 'g_total': g_total_loss,\n",
|
1101 |
+
" 'fake_mri': fake_mri,\n",
|
1102 |
+
" 'fake_ct': fake_ct\n",
|
1103 |
+
" }"
|
1104 |
+
]
|
1105 |
+
},
|
1106 |
+
{
|
1107 |
+
"cell_type": "markdown",
|
1108 |
+
"metadata": {},
|
1109 |
+
"source": [
|
1110 |
+
"This code defines the main training loop for a CycleGAN-based model that translates between CT and MRI images. It consists of data preparation, training iteration, progress tracking, and model saving. Below is a step-by-step breakdown:\n",
|
1111 |
+
"\n",
|
1112 |
+
"\n",
|
1113 |
+
"1. Create Progress Directory\n",
|
1114 |
+
"\n",
|
1115 |
+
"The script creates a directory named progress/ inside Kaggle's working directory.\n",
|
1116 |
+
"\n",
|
1117 |
+
"This directory will store progress images showing how well the model is learning over time.\n",
|
1118 |
+
"\n",
|
1119 |
+
"2. Load and Balance the Datasets\n",
|
1120 |
+
"\n",
|
1121 |
+
"Calls load_and_balance_datasets() to load CT and MRI images from the dataset folders.\n",
|
1122 |
+
"\n",
|
1123 |
+
"Ensures both datasets have the same number of images by truncating the larger set.\n",
|
1124 |
+
"\n",
|
1125 |
+
"3. Create TensorFlow Dataset for Training\n",
|
1126 |
+
"\n",
|
1127 |
+
"Creates a TensorFlow dataset from the loaded images.\n",
|
1128 |
+
"\n",
|
1129 |
+
"Shuffles the dataset to introduce randomness and prevent overfitting.\n",
|
1130 |
+
"\n",
|
1131 |
+
"Batches the dataset to process multiple images in parallel during training.\n",
|
1132 |
+
"\n",
|
1133 |
+
"\n",
|
1134 |
+
"4. Training Loop\n",
|
1135 |
+
"\n",
|
1136 |
+
"Starts iterating over epochs (EPOCHS defines the total number of passes over the dataset).\n",
|
1137 |
+
"\n",
|
1138 |
+
"Iterates through mini-batches of CT and MRI scans using train_dataset.\n",
|
1139 |
+
"\n",
|
1140 |
+
"5. Train the Model (Forward & Backward Pass)\n",
|
1141 |
+
"\n",
|
1142 |
+
"Calls train_step(ct_batch, mri_batch), which:\n",
|
1143 |
+
"\n",
|
1144 |
+
" Generates fake MRI from CT (G_CT→MRI) and fake CT from MRI (G_MRI→CT).\n",
|
1145 |
+
"\n",
|
1146 |
+
" Passes both real and fake images through the discriminators (D_CT and D_MRI).\n",
|
1147 |
+
"\n",
|
1148 |
+
" Computes adversarial losses, cycle consistency loss, and KL divergence.\n",
|
1149 |
+
"\n",
|
1150 |
+
" Updates the discriminators (D_CT, D_MRI) and generators (G_CT→MRI, G_MRI→CT).\n",
|
1151 |
+
"\n",
|
1152 |
+
"Stores the loss values (d_ct_loss, d_mri_loss, g_total_loss) and the generated images.\n",
|
1153 |
+
"\n",
|
1154 |
+
"6. Print Losses for Monitoring\n",
|
1155 |
+
"\n",
|
1156 |
+
"Every 10 batches, prints:\n",
|
1157 |
+
"\n",
|
1158 |
+
"D_CT: Discriminator loss for CT.\n",
|
1159 |
+
"\n",
|
1160 |
+
"D_MRI: Discriminator loss for MRI.\n",
|
1161 |
+
"\n",
|
1162 |
+
"G: Total generator loss.\n",
|
1163 |
+
"\n",
|
1164 |
+
"This helps monitor model performance during training.\n",
|
1165 |
+
"\n",
|
1166 |
+
"7. Save Sample Images for Progress Tracking\n",
|
1167 |
+
"\n",
|
1168 |
+
"Every 100 batches, saves progress images to progress/.\n",
|
1169 |
+
"\n",
|
1170 |
+
"Displays real CT & MRI images alongside their fake counterparts generated by the model.\n",
|
1171 |
+
"\n",
|
1172 |
+
"Helps visually track improvements in image quality over time.\n",
|
1173 |
+
"\n"
|
1174 |
+
]
|
1175 |
+
},
|
1176 |
+
{
|
1177 |
+
"cell_type": "code",
|
1178 |
+
"execution_count": 14,
|
1179 |
+
"metadata": {},
|
1180 |
+
"outputs": [
|
1181 |
+
{
|
1182 |
+
"ename": "NameError",
|
1183 |
+
"evalue": "name 'os' is not defined",
|
1184 |
+
"output_type": "error",
|
1185 |
+
"traceback": [
|
1186 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
1187 |
+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
1188 |
+
"Cell \u001b[1;32mIn[14], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ===================== Main Training Loop =====================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Create progress directory if it doesn't exist\u001b[39;00m\n\u001b[0;32m 3\u001b[0m progress_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/kaggle/working/progress\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(progress_dir):\n\u001b[0;32m 5\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(progress_dir)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# Load and prepare data\u001b[39;00m\n",
|
1189 |
+
"\u001b[1;31mNameError\u001b[0m: name 'os' is not defined"
|
1190 |
+
]
|
1191 |
+
}
|
1192 |
+
],
|
1193 |
+
"source": [
|
1194 |
+
"\n",
|
1195 |
+
"# ===================== Main Training Loop =====================\n",
|
1196 |
+
"# Create progress directory if it doesn't exist\n",
|
1197 |
+
"progress_dir = '/kaggle/working/progress'\n",
|
1198 |
+
"if not os.path.exists(progress_dir):\n",
|
1199 |
+
" os.makedirs(progress_dir)\n",
|
1200 |
+
"\n",
|
1201 |
+
"# Load and prepare data\n",
|
1202 |
+
"print(\"Loading datasets...\")\n",
|
1203 |
+
"ct_scans, mri_scans = load_and_balance_datasets('/kaggle/input/ct-to-mri-cgan/Dataset/images/trainA', \n",
|
1204 |
+
" '/kaggle/input/ct-to-mri-cgan/Dataset/images/trainB')\n",
|
1205 |
+
"\n",
|
1206 |
+
"# Create TensorFlow dataset\n",
|
1207 |
+
"train_dataset = tf.data.Dataset.from_tensor_slices((ct_scans, mri_scans))\n",
|
1208 |
+
"train_dataset = train_dataset.shuffle(buffer_size=len(ct_scans)).batch(BATCH_SIZE)\n",
|
1209 |
+
"# Training loop\n",
|
1210 |
+
"print(\"Starting training...\")\n",
|
1211 |
+
"for epoch in range(EPOCHS):\n",
|
1212 |
+
" for batch_idx, (ct_batch, mri_batch) in enumerate(train_dataset):\n",
|
1213 |
+
" results = train_step(ct_batch, mri_batch)\n",
|
1214 |
+
" \n",
|
1215 |
+
" if batch_idx % 10 == 0:\n",
|
1216 |
+
" print(f\"Epoch {epoch}, Batch {batch_idx}: \"\n",
|
1217 |
+
" f\"D_CT={float(results['d_ct']):.4f}, \"\n",
|
1218 |
+
" f\"D_MRI={float(results['d_mri']):.4f}, \"\n",
|
1219 |
+
" f\"G={float(results['g_total']):.4f}\")\n",
|
1220 |
+
" \n",
|
1221 |
+
" # Save sample images every 100 batches\n",
|
1222 |
+
" if batch_idx % 100 == 0:\n",
|
1223 |
+
" fig, axes = plt.subplots(2, 2, figsize=(10, 10))\n",
|
1224 |
+
" \n",
|
1225 |
+
" # Real CT and Fake MRI\n",
|
1226 |
+
" axes[0,0].imshow(ct_batch[0].numpy())\n",
|
1227 |
+
" axes[0,0].set_title(\"Real CT\")\n",
|
1228 |
+
" axes[0,0].axis('off')\n",
|
1229 |
+
" \n",
|
1230 |
+
" axes[0,1].imshow(results['fake_mri'][0].numpy())\n",
|
1231 |
+
" axes[0,1].set_title(\"Fake MRI\")\n",
|
1232 |
+
" axes[0,1].axis('off')\n",
|
1233 |
+
" \n",
|
1234 |
+
" # Real MRI and Fake CT\n",
|
1235 |
+
" axes[1,0].imshow(mri_batch[0].numpy())\n",
|
1236 |
+
" axes[1,0].set_title(\"Real MRI\")\n",
|
1237 |
+
" axes[1,0].axis('off')\n",
|
1238 |
+
" \n",
|
1239 |
+
" axes[1,1].imshow(results['fake_ct'][0].numpy())\n",
|
1240 |
+
" axes[1,1].set_title(\"Fake CT\")\n",
|
1241 |
+
" axes[1,1].axis('off')\n",
|
1242 |
+
" \n",
|
1243 |
+
" plt.tight_layout()\n",
|
1244 |
+
" plt.savefig(f'progress/epoch_{epoch}_batch_{batch_idx}.png')\n",
|
1245 |
+
" plt.close()\n",
|
1246 |
+
" \n",
|
1247 |
+
" # Save models after each epoch\n",
|
1248 |
+
" save_models(g_ct_mri, g_mri_ct, epoch)"
|
1249 |
+
]
|
1250 |
+
},
|
1251 |
+
{
|
1252 |
+
"cell_type": "code",
|
1253 |
+
"execution_count": null,
|
1254 |
+
"metadata": {},
|
1255 |
+
"outputs": [],
|
1256 |
+
"source": [
|
1257 |
+
"\n",
|
1258 |
+
"def translate_image(model_path, image_path, output_path, mode='ct_to_mri'):\n",
|
1259 |
+
" \"\"\"\n",
|
1260 |
+
" Translate a single image using the trained model\n",
|
1261 |
+
" \n",
|
1262 |
+
" Parameters:\n",
|
1263 |
+
" model_path: Path to the saved model\n",
|
1264 |
+
" image_path: Path to the input image\n",
|
1265 |
+
" output_path: Path to save the translated image\n",
|
1266 |
+
" mode: 'ct_to_mri' or 'mri_to_ct'\n",
|
1267 |
+
" \"\"\"\n",
|
1268 |
+
" # Load model\n",
|
1269 |
+
" print(f\"Loading model from {model_path}\")\n",
|
1270 |
+
" model = tf.keras.models.load_model(model_path, \n",
|
1271 |
+
" custom_objects={'Sampling': Sampling})\n",
|
1272 |
+
" \n",
|
1273 |
+
" # Load and preprocess image\n",
|
1274 |
+
" input_image = load_and_preprocess_image(image_path)\n",
|
1275 |
+
" \n",
|
1276 |
+
" # Generate translation\n",
|
1277 |
+
" print(\"Generating translation...\")\n",
|
1278 |
+
" translated_image, _, _ = model(input_image, training=False)\n",
|
1279 |
+
" \n",
|
1280 |
+
" # Convert to numpy and denormalize\n",
|
1281 |
+
" translated_image = translated_image.numpy()[0] * 255\n",
|
1282 |
+
" translated_image = translated_image.astype(np.uint8)\n",
|
1283 |
+
" \n",
|
1284 |
+
" # Save the result\n",
|
1285 |
+
" print(f\"Saving translated image to {output_path}\")\n",
|
1286 |
+
" plt.figure(figsize=(10, 5))\n",
|
1287 |
+
" \n",
|
1288 |
+
" plt.subplot(1, 2, 1)\n",
|
1289 |
+
" plt.title(\"Input Image\")\n",
|
1290 |
+
" plt.imshow(input_image[0])\n",
|
1291 |
+
" plt.axis('off')\n",
|
1292 |
+
" \n",
|
1293 |
+
" plt.subplot(1, 2, 2)\n",
|
1294 |
+
" plt.title(\"Translated Image\")\n",
|
1295 |
+
" plt.imshow(translated_image)\n",
|
1296 |
+
" plt.axis('off')\n",
|
1297 |
+
" \n",
|
1298 |
+
" plt.tight_layout()\n",
|
1299 |
+
" plt.savefig(output_path)\n",
|
1300 |
+
" plt.close()\n",
|
1301 |
+
" \n",
|
1302 |
+
" return translated_image\n",
|
1303 |
+
"'''\n",
|
1304 |
+
"# Example usage of the translation function\n",
|
1305 |
+
"def example_translation():\n",
|
1306 |
+
" \"\"\"Example of how to use the translation function\"\"\"\n",
|
1307 |
+
" # Paths\n",
|
1308 |
+
" ct_to_mri_model = 'saved_models/ct_to_mri_epoch_1000'\n",
|
1309 |
+
" mri_to_ct_model = 'saved_models/mri_to_ct_epoch_1000'\n",
|
1310 |
+
" \n",
|
1311 |
+
" # CT to MRI translation\n",
|
1312 |
+
" input_ct = 'path/to/your/ct_image.jpg'\n",
|
1313 |
+
" output_mri = 'results/translated_mri.png'\n",
|
1314 |
+
" translated_mri = translate_image(ct_to_mri_model, input_ct, output_mri, \n",
|
1315 |
+
" mode='ct_to_mri')\n",
|
1316 |
+
" \n",
|
1317 |
+
" # MRI to CT translation\n",
|
1318 |
+
" input_mri = 'path/to/your/mri_image.jpg'\n",
|
1319 |
+
" output_ct = 'results/translated_ct.png'\n",
|
1320 |
+
" translated_ct = translate_image(mri_to_ct_model, input_mri, output_ct, \n",
|
1321 |
+
" mode='mri_to_ct')'''"
|
1322 |
+
]
|
1323 |
+
},
|
1324 |
+
{
|
1325 |
+
"cell_type": "markdown",
|
1326 |
+
"metadata": {},
|
1327 |
+
"source": [
|
1328 |
+
"*Complete code in Single Block*"
|
1329 |
+
]
|
1330 |
+
},
|
1331 |
+
{
|
1332 |
+
"cell_type": "code",
|
1333 |
+
"execution_count": null,
|
1334 |
+
"metadata": {},
|
1335 |
+
"outputs": [],
|
1336 |
+
"source": [
|
1337 |
+
"import tensorflow as tf\n",
|
1338 |
+
"from tensorflow.keras import layers, Model\n",
|
1339 |
+
"import numpy as np\n",
|
1340 |
+
"import cv2\n",
|
1341 |
+
"import pathlib\n",
|
1342 |
+
"import matplotlib.pyplot as plt\n",
|
1343 |
+
"import tensorflow_probability as tfp\n",
|
1344 |
+
"\n",
|
1345 |
+
"tfd = tfp.distributions\n",
|
1346 |
+
"\n",
|
1347 |
+
"# ===================== Configuration =====================\n",
|
1348 |
+
"IMAGE_SHAPE = (256, 256, 3)\n",
|
1349 |
+
"LATENT_DIM = 256\n",
|
1350 |
+
"FILTERS = 16\n",
|
1351 |
+
"KERNEL = 3\n",
|
1352 |
+
"LEARNING_RATE = 0.0001\n",
|
1353 |
+
"WEIGHT_DECAY = 6e-8\n",
|
1354 |
+
"BATCH_SIZE = 1\n",
|
1355 |
+
"EPOCHS = 10\n",
|
1356 |
+
"\n",
|
1357 |
+
"# ===================== Architecture Components =====================\n",
|
1358 |
+
"class Sampling(layers.Layer):\n",
|
1359 |
+
" def call(self, inputs):\n",
|
1360 |
+
" z_mean, z_log_var = inputs\n",
|
1361 |
+
" batch = tf.shape(z_mean)[0]\n",
|
1362 |
+
" dim = tf.shape(z_mean)[1]\n",
|
1363 |
+
" epsilon = tf.random.normal(shape=(batch, dim))\n",
|
1364 |
+
" return z_mean + tf.exp(0.5 * z_log_var) * epsilon\n",
|
1365 |
+
"\n",
|
1366 |
+
"def residual_block(inputs, filters, use_norm=True):\n",
|
1367 |
+
" x = layers.Conv2D(filters, KERNEL, padding='same')(inputs)\n",
|
1368 |
+
" x = layers.LeakyReLU(alpha=0.2)(x)\n",
|
1369 |
+
" if use_norm:\n",
|
1370 |
+
" x = layers.GroupNormalization(groups=1)(x)\n",
|
1371 |
+
" x = layers.Conv2D(filters, KERNEL, padding='same')(x)\n",
|
1372 |
+
" x = layers.LeakyReLU(alpha=0.2)(x)\n",
|
1373 |
+
" if use_norm:\n",
|
1374 |
+
" x = layers.GroupNormalization(groups=1)(x)\n",
|
1375 |
+
" shortcut = layers.Conv2D(filters, 1, padding='same')(inputs)\n",
|
1376 |
+
" return layers.maximum([x, shortcut])\n",
|
1377 |
+
"\n",
|
1378 |
+
"def encoder_block(inputs, filters, use_norm=True):\n",
|
1379 |
+
" x = residual_block(inputs, filters, use_norm)\n",
|
1380 |
+
" skip = x\n",
|
1381 |
+
" x = layers.MaxPooling2D()(x)\n",
|
1382 |
+
" return x, skip\n",
|
1383 |
+
"\n",
|
1384 |
+
"def decoder_block(inputs, skip, filters, use_norm=True):\n",
|
1385 |
+
" x = layers.Conv2DTranspose(filters, KERNEL, strides=2, padding='same')(inputs)\n",
|
1386 |
+
" x = layers.maximum([x, skip])\n",
|
1387 |
+
" x = residual_block(x, filters, use_norm)\n",
|
1388 |
+
" return x\n",
|
1389 |
+
"\n",
|
1390 |
+
"# ===================== Generator =====================\n",
|
1391 |
+
"def build_generator(name):\n",
|
1392 |
+
" inputs = layers.Input(IMAGE_SHAPE)\n",
|
1393 |
+
" \n",
|
1394 |
+
" # Encoder\n",
|
1395 |
+
" e1, s1 = encoder_block(inputs, FILTERS)\n",
|
1396 |
+
" e2, s2 = encoder_block(e1, FILTERS*2)\n",
|
1397 |
+
" e3, s3 = encoder_block(e2, FILTERS*4)\n",
|
1398 |
+
" e4, s4 = encoder_block(e3, FILTERS*8)\n",
|
1399 |
+
" e5, s5 = encoder_block(e4, FILTERS*16)\n",
|
1400 |
+
" e6, s6 = encoder_block(e5, FILTERS*32)\n",
|
1401 |
+
" e7, s7 = encoder_block(e6, FILTERS*64)\n",
|
1402 |
+
" \n",
|
1403 |
+
" # Latent Space\n",
|
1404 |
+
" x = layers.Flatten()(e7)\n",
|
1405 |
+
" z_mean = layers.Dense(LATENT_DIM, name=f\"z_mean_{name.split('_')[-1]}\")(x)\n",
|
1406 |
+
" z_log_var = layers.Dense(LATENT_DIM, name=f\"z_log_var_{name.split('_')[-1]}\")(x)\n",
|
1407 |
+
" z = Sampling()([z_mean, z_log_var])\n",
|
1408 |
+
" \n",
|
1409 |
+
" # Reshape for decoder\n",
|
1410 |
+
" x = layers.Dense(2 * 2 * FILTERS*64)(z)\n",
|
1411 |
+
" x = layers.Reshape((2, 2, FILTERS*64))(x)\n",
|
1412 |
+
" \n",
|
1413 |
+
" # Decoder\n",
|
1414 |
+
" d0 = decoder_block(x, s7, FILTERS*64)\n",
|
1415 |
+
" d1 = decoder_block(d0, s6, FILTERS*32)\n",
|
1416 |
+
" d2 = decoder_block(d1, s5, FILTERS*16)\n",
|
1417 |
+
" d3 = decoder_block(d2, s4, FILTERS*8)\n",
|
1418 |
+
" d4 = decoder_block(d3, s3, FILTERS*4)\n",
|
1419 |
+
" d5 = decoder_block(d4, s2, FILTERS*2)\n",
|
1420 |
+
" d6 = decoder_block(d5, s1, FILTERS)\n",
|
1421 |
+
" \n",
|
1422 |
+
" outputs = layers.Conv2D(3, KERNEL, activation='sigmoid', padding='same')(d6)\n",
|
1423 |
+
" return Model(inputs, [outputs, z_mean, z_log_var], name=name)\n",
|
1424 |
+
"\n",
|
1425 |
+
"# ===================== Discriminator =====================\n",
|
1426 |
+
"def build_discriminator(name):\n",
|
1427 |
+
" inputs = layers.Input(IMAGE_SHAPE)\n",
|
1428 |
+
" \n",
|
1429 |
+
" # Feature extraction\n",
|
1430 |
+
" x = inputs\n",
|
1431 |
+
" features = []\n",
|
1432 |
+
" \n",
|
1433 |
+
" # Initial convolution\n",
|
1434 |
+
" x = layers.Conv2D(FILTERS, KERNEL, padding='same')(x)\n",
|
1435 |
+
" x = layers.LeakyReLU(alpha=0.2)(x)\n",
|
1436 |
+
" features.append(x)\n",
|
1437 |
+
" \n",
|
1438 |
+
" # Downsampling blocks\n",
|
1439 |
+
" filter_sizes = [FILTERS*2, FILTERS*4, FILTERS*8, FILTERS*16, FILTERS*32, FILTERS*64]\n",
|
1440 |
+
" for filters in filter_sizes:\n",
|
1441 |
+
" x, _ = encoder_block(x, filters, use_norm=False)\n",
|
1442 |
+
" features.append(x)\n",
|
1443 |
+
" \n",
|
1444 |
+
" # Multi-scale outputs\n",
|
1445 |
+
" outputs = []\n",
|
1446 |
+
" for i, feature in enumerate(features[-4:]):\n",
|
1447 |
+
" out = layers.Conv2D(1, KERNEL, padding='same')(feature)\n",
|
1448 |
+
" outputs.append(out)\n",
|
1449 |
+
" \n",
|
1450 |
+
" return Model(inputs, outputs, name=name)\n",
|
1451 |
+
"\n",
|
1452 |
+
"# ===================== Data Loading =====================\n",
|
1453 |
+
"def load_images(path):\n",
|
1454 |
+
" images = []\n",
|
1455 |
+
" for p in pathlib.Path(path).glob('*.*'):\n",
|
1456 |
+
" try:\n",
|
1457 |
+
" img = cv2.imread(str(p))\n",
|
1458 |
+
" if img is not None:\n",
|
1459 |
+
" img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
|
1460 |
+
" img = cv2.resize(img, IMAGE_SHAPE[:2])\n",
|
1461 |
+
" img = img.astype(np.float32) / 255.0\n",
|
1462 |
+
" images.append(img)\n",
|
1463 |
+
" except Exception as e:\n",
|
1464 |
+
" print(f\"Error loading image {p}: {e}\")\n",
|
1465 |
+
" return np.array(images)\n",
|
1466 |
+
"\n",
|
1467 |
+
"def load_and_balance_datasets(ct_path, mri_path):\n",
|
1468 |
+
" print(\"Loading CT scans...\")\n",
|
1469 |
+
" ct_scans = load_images(ct_path)\n",
|
1470 |
+
" print(\"Loading MRI scans...\")\n",
|
1471 |
+
" mri_scans = load_images(mri_path)\n",
|
1472 |
+
" \n",
|
1473 |
+
" min_length = min(len(ct_scans), len(mri_scans))\n",
|
1474 |
+
" ct_scans = ct_scans[:min_length]\n",
|
1475 |
+
" mri_scans = mri_scans[:min_length]\n",
|
1476 |
+
" \n",
|
1477 |
+
" print(f\"Balanced datasets to {min_length} images each\")\n",
|
1478 |
+
" return ct_scans, mri_scans\n",
|
1479 |
+
"\n",
|
1480 |
+
"# ===================== Training Setup =====================\n",
|
1481 |
+
"# Build models\n",
|
1482 |
+
"g_ct_mri = build_generator('CT_to_MRI')\n",
|
1483 |
+
"g_mri_ct = build_generator('MRI_to_CT')\n",
|
1484 |
+
"d_ct = build_discriminator('D_CT')\n",
|
1485 |
+
"d_mri = build_discriminator('D_MRI')\n",
|
1486 |
+
"\n",
|
1487 |
+
"# Create optimizers\n",
|
1488 |
+
"g_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
|
1489 |
+
"d_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
|
1490 |
+
"\n",
|
1491 |
+
"# Initialize model variables\n",
|
1492 |
+
"g_vars = g_ct_mri.trainable_variables + g_mri_ct.trainable_variables\n",
|
1493 |
+
"d_vars = d_ct.trainable_variables + d_mri.trainable_variables\n",
|
1494 |
+
"\n",
|
1495 |
+
"# Build optimizers\n",
|
1496 |
+
"g_opt.build(g_vars)\n",
|
1497 |
+
"d_opt.build(d_vars)\n",
|
1498 |
+
"\n",
|
1499 |
+
"# ===================== Training Function =====================\n",
|
1500 |
+
"@tf.function\n",
|
1501 |
+
"def train_step(real_ct, real_mri):\n",
|
1502 |
+
" with tf.GradientTape(persistent=True) as tape:\n",
|
1503 |
+
" # Forward passes\n",
|
1504 |
+
" fake_mri, z_mean_fwd, z_log_var_fwd = g_ct_mri(real_ct, training=True)\n",
|
1505 |
+
" fake_ct, z_mean_bwd, z_log_var_bwd = g_mri_ct(real_mri, training=True)\n",
|
1506 |
+
" \n",
|
1507 |
+
" # Discriminator outputs\n",
|
1508 |
+
" d_real_ct = d_ct(real_ct, training=True)\n",
|
1509 |
+
" d_fake_ct = d_ct(fake_ct, training=True)\n",
|
1510 |
+
" d_real_mri = d_mri(real_mri, training=True)\n",
|
1511 |
+
" d_fake_mri = d_mri(fake_mri, training=True)\n",
|
1512 |
+
" \n",
|
1513 |
+
" # Discriminator losses\n",
|
1514 |
+
" d_ct_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
|
1515 |
+
" for real, fake in zip(d_real_ct, d_fake_ct)])\n",
|
1516 |
+
" d_mri_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
|
1517 |
+
" for real, fake in zip(d_real_mri, d_fake_mri)])\n",
|
1518 |
+
" \n",
|
1519 |
+
" # Cycle consistency\n",
|
1520 |
+
" cycled_ct, _, _ = g_mri_ct(fake_mri, training=True)\n",
|
1521 |
+
" cycled_mri, _, _ = g_ct_mri(fake_ct, training=True)\n",
|
1522 |
+
" \n",
|
1523 |
+
" # KL Divergence\n",
|
1524 |
+
" kl_fwd = -0.5 * tf.reduce_mean(1 + z_log_var_fwd - tf.square(z_mean_fwd) - tf.exp(z_log_var_fwd))\n",
|
1525 |
+
" kl_bwd = -0.5 * tf.reduce_mean(1 + z_log_var_bwd - tf.square(z_mean_bwd) - tf.exp(z_log_var_bwd))\n",
|
1526 |
+
" \n",
|
1527 |
+
" # Generator losses\n",
|
1528 |
+
" g_adv_loss = sum([tf.reduce_mean((fake - 1)**2) for fake in d_fake_mri + d_fake_ct])\n",
|
1529 |
+
" g_cycle_loss = (tf.reduce_mean(tf.abs(real_ct - cycled_ct)) + \n",
|
1530 |
+
" tf.reduce_mean(tf.abs(real_mri - cycled_mri)))\n",
|
1531 |
+
" g_total_loss = g_adv_loss + 10 * g_cycle_loss + 0.5 * (kl_fwd + kl_bwd)\n",
|
1532 |
+
" \n",
|
1533 |
+
" # Total discriminator loss\n",
|
1534 |
+
" d_total_loss = d_ct_loss + d_mri_loss\n",
|
1535 |
+
" \n",
|
1536 |
+
" # Update discriminators\n",
|
1537 |
+
" d_grads = tape.gradient(d_total_loss, d_vars)\n",
|
1538 |
+
" d_opt.apply_gradients(zip(d_grads, d_vars))\n",
|
1539 |
+
" \n",
|
1540 |
+
" # Update generators\n",
|
1541 |
+
" g_grads = tape.gradient(g_total_loss, g_vars)\n",
|
1542 |
+
" g_opt.apply_gradients(zip(g_grads, g_vars))\n",
|
1543 |
+
" \n",
|
1544 |
+
" return {\n",
|
1545 |
+
" 'd_ct': d_ct_loss,\n",
|
1546 |
+
" 'd_mri': d_mri_loss,\n",
|
1547 |
+
" 'g_total': g_total_loss,\n",
|
1548 |
+
" 'fake_mri': fake_mri,\n",
|
1549 |
+
" 'fake_ct': fake_ct\n",
|
1550 |
+
" }\n",
|
1551 |
+
"\n",
|
1552 |
+
"\n",
|
1553 |
+
"\n",
|
1554 |
+
"import os\n",
|
1555 |
+
"\n",
|
1556 |
+
"def save_models(g_ct_mri, g_mri_ct, epoch, model_dir='/kaggle/working/saved_models'):\n",
|
1557 |
+
" \"\"\"Save models in HDF5 format after each epoch\"\"\"\n",
|
1558 |
+
" if not os.path.exists(model_dir):\n",
|
1559 |
+
" os.makedirs(model_dir)\n",
|
1560 |
+
" \n",
|
1561 |
+
" # Save as .h5 files\n",
|
1562 |
+
" ct_path = os.path.join(model_dir, f'ct_to_mri_epoch_{epoch}.h5')\n",
|
1563 |
+
" mri_path = os.path.join(model_dir, f'mri_to_ct_epoch_{epoch}.h5')\n",
|
1564 |
+
" \n",
|
1565 |
+
" g_ct_mri.save(ct_path)\n",
|
1566 |
+
" g_mri_ct.save(mri_path)\n",
|
1567 |
+
" print(f\"Models saved: {ct_path} and {mri_path}\")\n",
|
1568 |
+
"\n",
|
1569 |
+
"\n",
|
1570 |
+
"# ===================== Main Training Loop =====================\n",
|
1571 |
+
"# Create progress directory if it doesn't exist\n",
|
1572 |
+
"progress_dir = '/kaggle/working/progress'\n",
|
1573 |
+
"if not os.path.exists(progress_dir):\n",
|
1574 |
+
" os.makedirs(progress_dir)\n",
|
1575 |
+
"\n",
|
1576 |
+
"# Load and prepare data\n",
|
1577 |
+
"print(\"Loading datasets...\")\n",
|
1578 |
+
"ct_scans, mri_scans = load_and_balance_datasets('/kaggle/input/ct-to-mri-cgan/Dataset/images/trainA', \n",
|
1579 |
+
" '/kaggle/input/ct-to-mri-cgan/Dataset/images/trainB')\n",
|
1580 |
+
"\n",
|
1581 |
+
"# Create TensorFlow dataset\n",
|
1582 |
+
"train_dataset = tf.data.Dataset.from_tensor_slices((ct_scans, mri_scans))\n",
|
1583 |
+
"train_dataset = train_dataset.shuffle(buffer_size=len(ct_scans)).batch(BATCH_SIZE)\n",
|
1584 |
+
"# Training loop\n",
|
1585 |
+
"print(\"Starting training...\")\n",
|
1586 |
+
"for epoch in range(EPOCHS):\n",
|
1587 |
+
" for batch_idx, (ct_batch, mri_batch) in enumerate(train_dataset):\n",
|
1588 |
+
" results = train_step(ct_batch, mri_batch)\n",
|
1589 |
+
" \n",
|
1590 |
+
" if batch_idx % 10 == 0:\n",
|
1591 |
+
" print(f\"Epoch {epoch}, Batch {batch_idx}: \"\n",
|
1592 |
+
" f\"D_CT={float(results['d_ct']):.4f}, \"\n",
|
1593 |
+
" f\"D_MRI={float(results['d_mri']):.4f}, \"\n",
|
1594 |
+
" f\"G={float(results['g_total']):.4f}\")\n",
|
1595 |
+
" \n",
|
1596 |
+
" # Save sample images every 100 batches\n",
|
1597 |
+
" if batch_idx % 100 == 0:\n",
|
1598 |
+
" fig, axes = plt.subplots(2, 2, figsize=(10, 10))\n",
|
1599 |
+
" \n",
|
1600 |
+
" # Real CT and Fake MRI\n",
|
1601 |
+
" axes[0,0].imshow(ct_batch[0].numpy())\n",
|
1602 |
+
" axes[0,0].set_title(\"Real CT\")\n",
|
1603 |
+
" axes[0,0].axis('off')\n",
|
1604 |
+
" \n",
|
1605 |
+
" axes[0,1].imshow(results['fake_mri'][0].numpy())\n",
|
1606 |
+
" axes[0,1].set_title(\"Fake MRI\")\n",
|
1607 |
+
" axes[0,1].axis('off')\n",
|
1608 |
+
" \n",
|
1609 |
+
" # Real MRI and Fake CT\n",
|
1610 |
+
" axes[1,0].imshow(mri_batch[0].numpy())\n",
|
1611 |
+
" axes[1,0].set_title(\"Real MRI\")\n",
|
1612 |
+
" axes[1,0].axis('off')\n",
|
1613 |
+
" \n",
|
1614 |
+
" axes[1,1].imshow(results['fake_ct'][0].numpy())\n",
|
1615 |
+
" axes[1,1].set_title(\"Fake CT\")\n",
|
1616 |
+
" axes[1,1].axis('off')\n",
|
1617 |
+
" \n",
|
1618 |
+
" plt.tight_layout()\n",
|
1619 |
+
" plt.savefig(f'progress/epoch_{epoch}_batch_{batch_idx}.png')\n",
|
1620 |
+
" plt.close()\n",
|
1621 |
+
" \n",
|
1622 |
+
" # Save models after each epoch\n",
|
1623 |
+
" save_models(g_ct_mri, g_mri_ct, epoch)\n",
|
1624 |
+
"\n",
|
1625 |
+
"def load_and_preprocess_image(image_path):\n",
|
1626 |
+
" \"\"\"Load and preprocess a single image for inference\"\"\"\n",
|
1627 |
+
" # Read image\n",
|
1628 |
+
" img = cv2.imread(image_path)\n",
|
1629 |
+
" if img is None:\n",
|
1630 |
+
" raise ValueError(f\"Could not load image from {image_path}\")\n",
|
1631 |
+
" \n",
|
1632 |
+
" # Convert BGR to RGB\n",
|
1633 |
+
" img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
|
1634 |
+
" \n",
|
1635 |
+
" # Resize to model's input size\n",
|
1636 |
+
" img = cv2.resize(img, (256, 256))\n",
|
1637 |
+
" \n",
|
1638 |
+
" # Normalize to [0, 1]\n",
|
1639 |
+
" img = img.astype(np.float32) / 255.0\n",
|
1640 |
+
" \n",
|
1641 |
+
" # Add batch dimension\n",
|
1642 |
+
" img = np.expand_dims(img, axis=0)\n",
|
1643 |
+
" \n",
|
1644 |
+
" return img\n",
|
1645 |
+
"\n",
|
1646 |
+
"def translate_image(model_path, image_path, output_path, mode='ct_to_mri'):\n",
|
1647 |
+
" \"\"\"\n",
|
1648 |
+
" Translate a single image using the trained model\n",
|
1649 |
+
" \n",
|
1650 |
+
" Parameters:\n",
|
1651 |
+
" model_path: Path to the saved model\n",
|
1652 |
+
" image_path: Path to the input image\n",
|
1653 |
+
" output_path: Path to save the translated image\n",
|
1654 |
+
" mode: 'ct_to_mri' or 'mri_to_ct'\n",
|
1655 |
+
" \"\"\"\n",
|
1656 |
+
" # Load model\n",
|
1657 |
+
" print(f\"Loading model from {model_path}\")\n",
|
1658 |
+
" model = tf.keras.models.load_model(model_path, \n",
|
1659 |
+
" custom_objects={'Sampling': Sampling})\n",
|
1660 |
+
" \n",
|
1661 |
+
" # Load and preprocess image\n",
|
1662 |
+
" input_image = load_and_preprocess_image(image_path)\n",
|
1663 |
+
" \n",
|
1664 |
+
" # Generate translation\n",
|
1665 |
+
" print(\"Generating translation...\")\n",
|
1666 |
+
" translated_image, _, _ = model(input_image, training=False)\n",
|
1667 |
+
" \n",
|
1668 |
+
" # Convert to numpy and denormalize\n",
|
1669 |
+
" translated_image = translated_image.numpy()[0] * 255\n",
|
1670 |
+
" translated_image = translated_image.astype(np.uint8)\n",
|
1671 |
+
" \n",
|
1672 |
+
" # Save the result\n",
|
1673 |
+
" print(f\"Saving translated image to {output_path}\")\n",
|
1674 |
+
" plt.figure(figsize=(10, 5))\n",
|
1675 |
+
" \n",
|
1676 |
+
" plt.subplot(1, 2, 1)\n",
|
1677 |
+
" plt.title(\"Input Image\")\n",
|
1678 |
+
" plt.imshow(input_image[0])\n",
|
1679 |
+
" plt.axis('off')\n",
|
1680 |
+
" \n",
|
1681 |
+
" plt.subplot(1, 2, 2)\n",
|
1682 |
+
" plt.title(\"Translated Image\")\n",
|
1683 |
+
" plt.imshow(translated_image)\n",
|
1684 |
+
" plt.axis('off')\n",
|
1685 |
+
" \n",
|
1686 |
+
" plt.tight_layout()\n",
|
1687 |
+
" plt.savefig(output_path)\n",
|
1688 |
+
" plt.close()\n",
|
1689 |
+
" \n",
|
1690 |
+
" return translated_image\n",
|
1691 |
+
"'''\n",
|
1692 |
+
"# Example usage of the translation function\n",
|
1693 |
+
"def example_translation():\n",
|
1694 |
+
" \"\"\"Example of how to use the translation function\"\"\"\n",
|
1695 |
+
" # Paths\n",
|
1696 |
+
" ct_to_mri_model = 'saved_models/ct_to_mri_epoch_1000'\n",
|
1697 |
+
" mri_to_ct_model = 'saved_models/mri_to_ct_epoch_1000'\n",
|
1698 |
+
" \n",
|
1699 |
+
" # CT to MRI translation\n",
|
1700 |
+
" input_ct = 'path/to/your/ct_image.jpg'\n",
|
1701 |
+
" output_mri = 'results/translated_mri.png'\n",
|
1702 |
+
" translated_mri = translate_image(ct_to_mri_model, input_ct, output_mri, \n",
|
1703 |
+
" mode='ct_to_mri')\n",
|
1704 |
+
" \n",
|
1705 |
+
" # MRI to CT translation\n",
|
1706 |
+
" input_mri = 'path/to/your/mri_image.jpg'\n",
|
1707 |
+
" output_ct = 'results/translated_ct.png'\n",
|
1708 |
+
" translated_ct = translate_image(mri_to_ct_model, input_mri, output_ct, \n",
|
1709 |
+
" mode='mri_to_ct')'''"
|
1710 |
+
]
|
1711 |
+
},
|
1712 |
+
{
|
1713 |
+
"cell_type": "markdown",
|
1714 |
+
"metadata": {},
|
1715 |
+
"source": []
|
1716 |
+
},
|
1717 |
+
{
|
1718 |
+
"cell_type": "markdown",
|
1719 |
+
"metadata": {},
|
1720 |
+
"source": []
|
1721 |
+
},
|
1722 |
+
{
|
1723 |
+
"cell_type": "code",
|
1724 |
+
"execution_count": null,
|
1725 |
+
"metadata": {},
|
1726 |
+
"outputs": [],
|
1727 |
+
"source": []
|
1728 |
+
},
|
1729 |
+
{
|
1730 |
+
"cell_type": "code",
|
1731 |
+
"execution_count": null,
|
1732 |
+
"metadata": {},
|
1733 |
+
"outputs": [],
|
1734 |
+
"source": []
|
1735 |
+
}
|
1736 |
+
],
|
1737 |
+
"metadata": {
|
1738 |
+
"kernelspec": {
|
1739 |
+
"display_name": "image",
|
1740 |
+
"language": "python",
|
1741 |
+
"name": "python3"
|
1742 |
+
},
|
1743 |
+
"language_info": {
|
1744 |
+
"codemirror_mode": {
|
1745 |
+
"name": "ipython",
|
1746 |
+
"version": 3
|
1747 |
+
},
|
1748 |
+
"file_extension": ".py",
|
1749 |
+
"mimetype": "text/x-python",
|
1750 |
+
"name": "python",
|
1751 |
+
"nbconvert_exporter": "python",
|
1752 |
+
"pygments_lexer": "ipython3",
|
1753 |
+
"version": "3.10.11"
|
1754 |
+
}
|
1755 |
+
},
|
1756 |
+
"nbformat": 4,
|
1757 |
+
"nbformat_minor": 2
|
1758 |
+
}
|
demo.html
ADDED
@@ -0,0 +1,66 @@
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Simple CSS Page</title>
|
7 |
+
<style>
|
8 |
+
body {
|
9 |
+
font-family: Arial, sans-serif;
|
10 |
+
margin: 0;
|
11 |
+
padding: 0;
|
12 |
+
text-align: center;
|
13 |
+
background-color: #f4f4f4;
|
14 |
+
}
|
15 |
+
|
16 |
+
header {
|
17 |
+
background-color: #3498db;
|
18 |
+
color: white;
|
19 |
+
padding: 20px;
|
20 |
+
font-size: 24px;
|
21 |
+
|
22 |
+
}
|
23 |
+
|
24 |
+
section {
|
25 |
+
margin: 20px auto;
|
26 |
+
padding: 20px;
|
27 |
+
background-color: white;
|
28 |
+
border-radius: 10px;
|
29 |
+
|
30 |
+
max-width: 600px;
|
31 |
+
}
|
32 |
+
|
33 |
+
button {
|
34 |
+
background-color: #2ecc71;
|
35 |
+
color: white;
|
36 |
+
padding: 10px 20px;
|
37 |
+
border: none;
|
38 |
+
border-radius: 5px;
|
39 |
+
cursor: pointer;
|
40 |
+
font-size: 16px;
|
41 |
+
transition: background-color 0.3s ease;
|
42 |
+
}
|
43 |
+
|
44 |
+
button:hover {
|
45 |
+
background-color: #27ae60;
|
46 |
+
}
|
47 |
+
|
48 |
+
footer {
|
49 |
+
background-color: #333;
|
50 |
+
color: white;
|
51 |
+
padding: 10px;
|
52 |
+
margin-top: 20px;
|
53 |
+
font-size: 14px;
|
54 |
+
}
|
55 |
+
</style>
|
56 |
+
</head>
|
57 |
+
<body>
|
58 |
+
<header>Welcome to My Simple CSS Page</header>
|
59 |
+
<section>
|
60 |
+
<h2>Styled Elements</h2>
|
61 |
+
<p>This is a simple webpage demonstrating basic CSS styling.</p>
|
62 |
+
<button>Click Me</button>
|
63 |
+
</section>
|
64 |
+
<footer>© 2025 My Simple Page. All rights reserved.</footer>
|
65 |
+
</body>
|
66 |
+
</html>
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
1 |
+
pip
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE
ADDED
@@ -0,0 +1,20 @@
1 |
+
Copyright (c) 2017-2021 Ingy döt Net
|
2 |
+
Copyright (c) 2006-2016 Kirill Simonov
|
3 |
+
|
4 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
5 |
+
this software and associated documentation files (the "Software"), to deal in
|
6 |
+
the Software without restriction, including without limitation the rights to
|
7 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
8 |
+
of the Software, and to permit persons to whom the Software is furnished to do
|
9 |
+
so, subject to the following conditions:
|
10 |
+
|
11 |
+
The above copyright notice and this permission notice shall be included in all
|
12 |
+
copies or substantial portions of the Software.
|
13 |
+
|
14 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20 |
+
SOFTWARE.
|
env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA
ADDED
@@ -0,0 +1,46 @@
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: PyYAML
|
3 |
+
Version: 6.0.2
|
4 |
+
Summary: YAML parser and emitter for Python
|
5 |
+
Home-page: https://pyyaml.org/
|
6 |
+
Download-URL: https://pypi.org/project/PyYAML/
|
7 |
+
Author: Kirill Simonov
|
8 |
+
Author-email: [email protected]
|
9 |
+
License: MIT
|
10 |
+
Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
|
11 |
+
Project-URL: CI, https://github.com/yaml/pyyaml/actions
|
12 |
+
Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
|
13 |
+
Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
|
14 |
+
Project-URL: Source Code, https://github.com/yaml/pyyaml
|
15 |
+
Platform: Any
|
16 |
+
Classifier: Development Status :: 5 - Production/Stable
|
17 |
+
Classifier: Intended Audience :: Developers
|
18 |
+
Classifier: License :: OSI Approved :: MIT License
|
19 |
+
Classifier: Operating System :: OS Independent
|
20 |
+
Classifier: Programming Language :: Cython
|
21 |
+
Classifier: Programming Language :: Python
|
22 |
+
Classifier: Programming Language :: Python :: 3
|
23 |
+
Classifier: Programming Language :: Python :: 3.8
|
24 |
+
Classifier: Programming Language :: Python :: 3.9
|
25 |
+
Classifier: Programming Language :: Python :: 3.10
|
26 |
+
Classifier: Programming Language :: Python :: 3.11
|
27 |
+
Classifier: Programming Language :: Python :: 3.12
|
28 |
+
Classifier: Programming Language :: Python :: 3.13
|
29 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
30 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
31 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
32 |
+
Classifier: Topic :: Text Processing :: Markup
|
33 |
+
Requires-Python: >=3.8
|
34 |
+
License-File: LICENSE
|
35 |
+
|
36 |
+
YAML is a data serialization format designed for human readability
|
37 |
+
and interaction with scripting languages. PyYAML is a YAML parser
|
38 |
+
and emitter for Python.
|
39 |
+
|
40 |
+
PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
|
41 |
+
support, capable extension API, and sensible error messages. PyYAML
|
42 |
+
supports standard YAML tags and provides Python-specific tags that
|
43 |
+
allow to represent an arbitrary Python object.
|
44 |
+
|
45 |
+
PyYAML is applicable for a broad range of tasks from complex
|
46 |
+
configuration files to object serialization and persistence.
|
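Aside: the summary above describes a full YAML 1.1 parser and emitter. A minimal sketch of that round trip (illustrative only, not part of the committed files, assuming PyYAML 6.x as vendored here):

```python
# Minimal PyYAML round trip: parse a YAML document, then emit it back.
# safe_load/safe_dump restrict construction to standard YAML tags only.
import yaml

doc = yaml.safe_load("name: demo\nversion: 6.0.2\ntags: [parser, emitter]\n")
print(doc["tags"])                        # ['parser', 'emitter']
print(yaml.safe_dump(doc, sort_keys=False), end="")
```
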
env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,43 @@
PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
PyYAML-6.0.2.dist-info/METADATA,sha256=9lwXqTOrXPts-jI2Lo5UwuaAYo0hiRA0BZqjch0WjAk,2106
PyYAML-6.0.2.dist-info/RECORD,,
PyYAML-6.0.2.dist-info/WHEEL,sha256=c7SWG1_hRvc9HXHEkmWlTu1Jr4WpzRucfzqTP-_8q0s,102
PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
_yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
_yaml/__pycache__/__init__.cpython-312.pyc,,
yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
yaml/__pycache__/__init__.cpython-312.pyc,,
yaml/__pycache__/composer.cpython-312.pyc,,
yaml/__pycache__/constructor.cpython-312.pyc,,
yaml/__pycache__/cyaml.cpython-312.pyc,,
yaml/__pycache__/dumper.cpython-312.pyc,,
yaml/__pycache__/emitter.cpython-312.pyc,,
yaml/__pycache__/error.cpython-312.pyc,,
yaml/__pycache__/events.cpython-312.pyc,,
yaml/__pycache__/loader.cpython-312.pyc,,
yaml/__pycache__/nodes.cpython-312.pyc,,
yaml/__pycache__/parser.cpython-312.pyc,,
yaml/__pycache__/reader.cpython-312.pyc,,
yaml/__pycache__/representer.cpython-312.pyc,,
yaml/__pycache__/resolver.cpython-312.pyc,,
yaml/__pycache__/scanner.cpython-312.pyc,,
yaml/__pycache__/serializer.cpython-312.pyc,,
yaml/__pycache__/tokens.cpython-312.pyc,,
yaml/_yaml.cp312-win_amd64.pyd,sha256=Bx7e_LEQx7cnd1_A9_nClp3X77g-_Lw1aoAAtYZbwWk,263680
yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573

env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.44.0)
Root-Is-Purelib: false
Tag: cp312-cp312-win_amd64

env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1,2 @@
_yaml
yaml

env/Lib/site-packages/_yaml/__init__.py
ADDED
@@ -0,0 +1,33 @@
# This is a stub package designed to roughly emulate the _yaml
# extension module, which previously existed as a standalone module
# and has been moved into the `yaml` package namespace.
# It does not perfectly mimic its old counterpart, but should get
# close enough for anyone who's relying on it even when they shouldn't.
import yaml

# in some circumstances, the yaml module we imoprted may be from a different version, so we need
# to tread carefully when poking at it here (it may not have the attributes we expect)
if not getattr(yaml, '__with_libyaml__', False):
    from sys import version_info

    exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
    raise exc("No module named '_yaml'")
else:
    from yaml._yaml import *
    import warnings
    warnings.warn(
        'The _yaml extension module is now located at yaml._yaml'
        ' and its location is subject to change. To use the'
        ' LibYAML-based parser and emitter, import from `yaml`:'
        ' `from yaml import CLoader as Loader, CDumper as Dumper`.',
        DeprecationWarning
    )
    del warnings
    # Don't `del yaml` here because yaml is actually an existing
    # namespace member of _yaml.

__name__ = '_yaml'
# If the module is top-level (i.e. not a part of any specific package)
# then the attribute should be set to ''.
# https://docs.python.org/3.8/library/types.html
__package__ = ''

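Aside: the stub above only keeps old `import _yaml` code alive, and its DeprecationWarning points at the supported spelling. A small sketch of that recommended import, with the usual pure-Python fallback when the LibYAML extension is not built (the fallback choice mirrors the PyYAML documentation, not this file):

```python
# Prefer the LibYAML-backed classes exposed by `yaml`, as the warning above
# suggests, and fall back to the pure-Python implementations otherwise.
import yaml

try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

data = yaml.load("a: 1\nb: [2, 3]\n", Loader=Loader)
print(yaml.dump(data, Dumper=Dumper), end="")
```
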
env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
pip

env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE
ADDED
@@ -0,0 +1,20 @@
This package contains a modified version of ca-bundle.crt:

ca-bundle.crt -- Bundle of CA Root Certificates

This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.#

***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.

***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $

env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA
ADDED
@@ -0,0 +1,77 @@
Metadata-Version: 2.2
Name: certifi
Version: 2025.1.31
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: [email protected]
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Python: >=3.6
License-File: LICENSE
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: home-page
Dynamic: license
Dynamic: project-url
Dynamic: requires-python
Dynamic: summary

Certifi: Python SSL Certificates
================================

Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.

Installation
------------

``certifi`` is available on PyPI. Simply install it with ``pip``::

    $ pip install certifi

Usage
-----

To reference the installed certificate authority (CA) bundle, you can use the
built-in function::

    >>> import certifi

    >>> certifi.where()
    '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'

Or from the command line::

    $ python -m certifi
    /usr/local/lib/python3.7/site-packages/certifi/cacert.pem

Enjoy!

.. _`Requests`: https://requests.readthedocs.io/en/master/

Addition/Removal of Certificates
--------------------------------

Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.

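Aside: the usage section above only shows how to locate the bundle. A common next step, sketched here for illustration (not taken from the diff), is to hand `certifi.where()` to the standard library's `ssl` module so HTTPS requests verify against Mozilla's roots:

```python
# Build a TLS context that trusts certifi's CA bundle, then make a request
# through the standard library using that context.
import ssl
import urllib.request

import certifi

ctx = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen("https://pypi.org", context=ctx) as resp:
    print(resp.status)
```
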
env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
certifi-2025.1.31.dist-info/RECORD,,
certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-312.pyc,,
certifi/__pycache__/__main__.cpython-312.pyc,,
certifi/__pycache__/core.cpython-312.pyc,,
certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (75.8.0)
Root-Is-Purelib: true
Tag: py3-none-any

env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
certifi

env/Lib/site-packages/certifi/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .core import contents, where

__all__ = ["contents", "where"]
__version__ = "2025.01.31"

env/Lib/site-packages/certifi/__main__.py
ADDED
@@ -0,0 +1,12 @@
import argparse

from certifi import contents, where

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()

if args.contents:
    print(contents())
else:
    print(where())

env/Lib/site-packages/certifi/cacert.pem
ADDED
The diff for this file is too large to render. See raw diff.

env/Lib/site-packages/certifi/core.py
ADDED
@@ -0,0 +1,114 @@
"""
certifi.py
~~~~~~~~~~

This module returns the installation location of cacert.pem or its contents.
"""
import sys
import atexit

def exit_cacert_ctx() -> None:
    _CACERT_CTX.__exit__(None, None, None)  # type: ignore[union-attr]


if sys.version_info >= (3, 11):

    from importlib.resources import as_file, files

    _CACERT_CTX = None
    _CACERT_PATH = None

    def where() -> str:
        # This is slightly terrible, but we want to delay extracting the file
        # in cases where we're inside of a zipimport situation until someone
        # actually calls where(), but we don't want to re-extract the file
        # on every call of where(), so we'll do it once then store it in a
        # global variable.
        global _CACERT_CTX
        global _CACERT_PATH
        if _CACERT_PATH is None:
            # This is slightly janky, the importlib.resources API wants you to
            # manage the cleanup of this file, so it doesn't actually return a
            # path, it returns a context manager that will give you the path
            # when you enter it and will do any cleanup when you leave it. In
            # the common case of not needing a temporary file, it will just
            # return the file system location and the __exit__() is a no-op.
            #
            # We also have to hold onto the actual context manager, because
            # it will do the cleanup whenever it gets garbage collected, so
            # we will also store that at the global level as well.
            _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
            _CACERT_PATH = str(_CACERT_CTX.__enter__())
            atexit.register(exit_cacert_ctx)

        return _CACERT_PATH

    def contents() -> str:
        return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")

elif sys.version_info >= (3, 7):

    from importlib.resources import path as get_path, read_text

    _CACERT_CTX = None
    _CACERT_PATH = None

    def where() -> str:
        # This is slightly terrible, but we want to delay extracting the
        # file in cases where we're inside of a zipimport situation until
        # someone actually calls where(), but we don't want to re-extract
        # the file on every call of where(), so we'll do it once then store
        # it in a global variable.
        global _CACERT_CTX
        global _CACERT_PATH
        if _CACERT_PATH is None:
            # This is slightly janky, the importlib.resources API wants you
            # to manage the cleanup of this file, so it doesn't actually
            # return a path, it returns a context manager that will give
            # you the path when you enter it and will do any cleanup when
            # you leave it. In the common case of not needing a temporary
            # file, it will just return the file system location and the
            # __exit__() is a no-op.
            #
            # We also have to hold onto the actual context manager, because
            # it will do the cleanup whenever it gets garbage collected, so
            # we will also store that at the global level as well.
            _CACERT_CTX = get_path("certifi", "cacert.pem")
            _CACERT_PATH = str(_CACERT_CTX.__enter__())
            atexit.register(exit_cacert_ctx)

        return _CACERT_PATH

    def contents() -> str:
        return read_text("certifi", "cacert.pem", encoding="ascii")

else:
    import os
    import types
    from typing import Union

    Package = Union[types.ModuleType, str]
    Resource = Union[str, "os.PathLike"]

    # This fallback will work for Python versions prior to 3.7 that lack the
    # importlib.resources module but relies on the existing `where` function
    # so won't address issues with environments like PyOxidizer that don't set
    # __file__ on modules.
    def read_text(
        package: Package,
        resource: Resource,
        encoding: str = 'utf-8',
        errors: str = 'strict'
    ) -> str:
        with open(where(), encoding=encoding) as data:
            return data.read()

    # If we don't have importlib.resources, then we will just do the old logic
    # of assuming we're on the filesystem and munge the path directly.
    def where() -> str:
        f = os.path.dirname(__file__)

        return os.path.join(f, "cacert.pem")

    def contents() -> str:
        return read_text("certifi", "cacert.pem", encoding="ascii")

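Aside: core.py resolves the bundled `cacert.pem` through `importlib.resources` so the path stays valid even when certifi is imported from a zip archive. The same pattern works for any package data file; a minimal sketch on Python 3.9+ (the package and resource names here are just certifi's own, reused for illustration):

```python
# Resolve a resource that ships inside a package to a real filesystem path.
# as_file() extracts it to a temporary file only when needed (e.g. zipimport)
# and cleans up when the context manager exits.
from importlib.resources import as_file, files

with as_file(files("certifi").joinpath("cacert.pem")) as path:
    print(path)  # a usable filesystem path while the with-block is open
```
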
env/Lib/site-packages/certifi/py.typed
ADDED
File without changes

env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
pip

env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 TAHRI Ahmed R.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA
ADDED
@@ -0,0 +1,721 @@
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: charset-normalizer
|
3 |
+
Version: 3.4.1
|
4 |
+
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
5 |
+
Author-email: "Ahmed R. TAHRI" <[email protected]>
|
6 |
+
Maintainer-email: "Ahmed R. TAHRI" <[email protected]>
|
7 |
+
License: MIT
|
8 |
+
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
9 |
+
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
10 |
+
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
11 |
+
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
12 |
+
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
13 |
+
Classifier: Development Status :: 5 - Production/Stable
|
14 |
+
Classifier: Intended Audience :: Developers
|
15 |
+
Classifier: License :: OSI Approved :: MIT License
|
16 |
+
Classifier: Operating System :: OS Independent
|
17 |
+
Classifier: Programming Language :: Python
|
18 |
+
Classifier: Programming Language :: Python :: 3
|
19 |
+
Classifier: Programming Language :: Python :: 3.7
|
20 |
+
Classifier: Programming Language :: Python :: 3.8
|
21 |
+
Classifier: Programming Language :: Python :: 3.9
|
22 |
+
Classifier: Programming Language :: Python :: 3.10
|
23 |
+
Classifier: Programming Language :: Python :: 3.11
|
24 |
+
Classifier: Programming Language :: Python :: 3.12
|
25 |
+
Classifier: Programming Language :: Python :: 3.13
|
26 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
27 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
28 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
29 |
+
Classifier: Topic :: Text Processing :: Linguistic
|
30 |
+
Classifier: Topic :: Utilities
|
31 |
+
Classifier: Typing :: Typed
|
32 |
+
Requires-Python: >=3.7
|
33 |
+
Description-Content-Type: text/markdown
|
34 |
+
License-File: LICENSE
|
35 |
+
Provides-Extra: unicode-backport
|
36 |
+
|
37 |
+
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
38 |
+
|
39 |
+
<p align="center">
|
40 |
+
<sup>The Real First Universal Charset Detector</sup><br>
|
41 |
+
<a href="https://pypi.org/project/charset-normalizer">
|
42 |
+
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
43 |
+
</a>
|
44 |
+
<a href="https://pepy.tech/project/charset-normalizer/">
|
45 |
+
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
46 |
+
</a>
|
47 |
+
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
48 |
+
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
49 |
+
</a>
|
50 |
+
</p>
|
51 |
+
<p align="center">
|
52 |
+
<sup><i>Featured Packages</i></sup><br>
|
53 |
+
<a href="https://github.com/jawah/niquests">
|
54 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
|
55 |
+
</a>
|
56 |
+
<a href="https://github.com/jawah/wassima">
|
57 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
|
58 |
+
</a>
|
59 |
+
</p>
|
60 |
+
<p align="center">
|
61 |
+
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
62 |
+
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
63 |
+
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
64 |
+
</a>
|
65 |
+
</p>
|
66 |
+
|
67 |
+
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
68 |
+
> I'm trying to resolve the issue by taking a new approach.
|
69 |
+
> All IANA character set names for which the Python core library provides codecs are supported.
|
70 |
+
|
71 |
+
<p align="center">
|
72 |
+
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
73 |
+
</p>
|
74 |
+
|
75 |
+
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
76 |
+
|
77 |
+
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
78 |
+
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
79 |
+
| `Fast` | ❌ | ✅ | ✅ |
|
80 |
+
| `Universal**` | ❌ | ✅ | ❌ |
|
81 |
+
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
82 |
+
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
83 |
+
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
84 |
+
| `Native Python` | ✅ | ✅ | ❌ |
|
85 |
+
| `Detect spoken language` | ❌ | ✅ | N/A |
|
86 |
+
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
87 |
+
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
88 |
+
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
89 |
+
|
90 |
+
<p align="center">
|
91 |
+
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
92 |
+
</p>
|
93 |
+
|
94 |
+
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
95 |
+
|
96 |
+
## ⚡ Performance
|
97 |
+
|
98 |
+
This package offer better performance than its counterpart Chardet. Here are some numbers.
|
99 |
+
|
100 |
+
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
101 |
+
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
102 |
+
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
103 |
+
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
104 |
+
|
105 |
+
| Package | 99th percentile | 95th percentile | 50th percentile |
|
106 |
+
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
107 |
+
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
108 |
+
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
109 |
+
|
110 |
+
_updated as of december 2024 using CPython 3.12_
|
111 |
+
|
112 |
+
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
|
113 |
+
|
114 |
+
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
115 |
+
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
116 |
+
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
117 |
+
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
118 |
+
> (e.g. Supported Encoding) Challenge-them if you want.
|
119 |
+
|
120 |
+
## ✨ Installation
|
121 |
+
|
122 |
+
Using pip:
|
123 |
+
|
124 |
+
```sh
|
125 |
+
pip install charset-normalizer -U
|
126 |
+
```
|
127 |
+
|
128 |
+
## 🚀 Basic Usage
|
129 |
+
|
130 |
+
### CLI
|
131 |
+
This package comes with a CLI.
|
132 |
+
|
133 |
+
```
|
134 |
+
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
135 |
+
file [file ...]
|
136 |
+
|
137 |
+
The Real First Universal Charset Detector. Discover originating encoding used
|
138 |
+
on text file. Normalize text to unicode.
|
139 |
+
|
140 |
+
positional arguments:
|
141 |
+
files File(s) to be analysed
|
142 |
+
|
143 |
+
optional arguments:
|
144 |
+
-h, --help show this help message and exit
|
145 |
+
-v, --verbose Display complementary information about file if any.
|
146 |
+
Stdout will contain logs about the detection process.
|
147 |
+
-a, --with-alternative
|
148 |
+
Output complementary possibilities if any. Top-level
|
149 |
+
JSON WILL be a list.
|
150 |
+
-n, --normalize Permit to normalize input file. If not set, program
|
151 |
+
does not write anything.
|
152 |
+
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
153 |
+
JSON output.
|
154 |
+
-r, --replace Replace file when trying to normalize it instead of
|
155 |
+
creating a new one.
|
156 |
+
-f, --force Replace file without asking if you are sure, use this
|
157 |
+
flag with caution.
|
158 |
+
-t THRESHOLD, --threshold THRESHOLD
|
159 |
+
Define a custom maximum amount of chaos allowed in
|
160 |
+
decoded content. 0. <= chaos <= 1.
|
161 |
+
--version Show version information and exit.
|
162 |
+
```
|
163 |
+
|
164 |
+
```bash
|
165 |
+
normalizer ./data/sample.1.fr.srt
|
166 |
+
```
|
167 |
+
|
168 |
+
or
|
169 |
+
|
170 |
+
```bash
|
171 |
+
python -m charset_normalizer ./data/sample.1.fr.srt
|
172 |
+
```
|
173 |
+
|
174 |
+
🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
|
175 |
+
|
176 |
+
```json
|
177 |
+
{
|
178 |
+
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
179 |
+
"encoding": "cp1252",
|
180 |
+
"encoding_aliases": [
|
181 |
+
"1252",
|
182 |
+
"windows_1252"
|
183 |
+
],
|
184 |
+
"alternative_encodings": [
|
185 |
+
"cp1254",
|
186 |
+
"cp1256",
|
187 |
+
"cp1258",
|
188 |
+
"iso8859_14",
|
189 |
+
"iso8859_15",
|
190 |
+
"iso8859_16",
|
191 |
+
"iso8859_3",
|
192 |
+
"iso8859_9",
|
193 |
+
"latin_1",
|
194 |
+
"mbcs"
|
195 |
+
],
|
196 |
+
"language": "French",
|
197 |
+
"alphabets": [
|
198 |
+
"Basic Latin",
|
199 |
+
"Latin-1 Supplement"
|
200 |
+
],
|
201 |
+
"has_sig_or_bom": false,
|
202 |
+
"chaos": 0.149,
|
203 |
+
"coherence": 97.152,
|
204 |
+
"unicode_path": null,
|
205 |
+
"is_preferred": true
|
206 |
+
}
|
207 |
+
```
|
208 |
+
|
209 |
+
### Python
|
210 |
+
*Just print out normalized text*
|
211 |
+
```python
|
212 |
+
from charset_normalizer import from_path
|
213 |
+
|
214 |
+
results = from_path('./my_subtitle.srt')
|
215 |
+
|
216 |
+
print(str(results.best()))
|
217 |
+
```
|
218 |
+
|
219 |
+
*Upgrade your code without effort*
|
220 |
+
```python
|
221 |
+
from charset_normalizer import detect
|
222 |
+
```
|
223 |
+
|
224 |
+
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
225 |
+
|
226 |
+
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
227 |
+
|
228 |
+
## 😇 Why
|
229 |
+
|
230 |
+
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
231 |
+
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
232 |
+
|
233 |
+
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
234 |
+
produce **two identical rendered string.**
|
235 |
+
What I want is to get readable text, the best I can.
|
236 |
+
|
237 |
+
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
238 |
+
|
239 |
+
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
|
240 |
+
|
241 |
+
## 🍰 How
|
242 |
+
|
243 |
+
- Discard all charset encoding table that could not fit the binary content.
|
244 |
+
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
245 |
+
- Extract matches with the lowest mess detected.
|
246 |
+
- Additionally, we measure coherence / probe for a language.
|
247 |
+
|
248 |
+
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
249 |
+
|
250 |
+
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
251 |
+
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
252 |
+
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
253 |
+
improve or rewrite it.
|
254 |
+
|
255 |
+
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
256 |
+
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
257 |
+
|
258 |
+
## ⚡ Known limitations
|
259 |
+
|
260 |
+
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
261 |
+
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
|
262 |
+
|
263 |
+
## ⚠️ About Python EOLs
|
264 |
+
|
265 |
+
**If you are running:**
|
266 |
+
|
267 |
+
- Python >=2.7,<3.5: Unsupported
|
268 |
+
- Python 3.5: charset-normalizer < 2.1
|
269 |
+
- Python 3.6: charset-normalizer < 3.1
|
270 |
+
- Python 3.7: charset-normalizer < 4.0
|
271 |
+
|
272 |
+
Upgrade your Python interpreter as soon as possible.
|
273 |
+
|
274 |
+
## 👤 Contributing
|
275 |
+
|
276 |
+
Contributions, issues and feature requests are very much welcome.<br />
|
277 |
+
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
278 |
+
|
279 |
+
## 📝 License
|
280 |
+
|
281 |
+
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
282 |
+
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
283 |
+
|
284 |
+
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
285 |
+
|
286 |
+
## 💼 For Enterprise
|
287 |
+
|
288 |
+
Professional support for charset-normalizer is available as part of the [Tidelift
|
289 |
+
Subscription][1]. Tidelift gives software development teams a single source for
|
290 |
+
purchasing and maintaining their software, with professional grade assurances
|
291 |
+
from the experts who know it best, while seamlessly integrating with existing
|
292 |
+
tools.
|
293 |
+
|
294 |
+
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
295 |
+
|
296 |
+
[](https://www.bestpractices.dev/projects/7297)
|
297 |
+
|
298 |
+
# Changelog
|
299 |
+
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
300 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
301 |
+
|
302 |
+
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
303 |
+
|
304 |
+
### Changed
|
305 |
+
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
306 |
+
- Enforce annotation delayed loading for a simpler and consistent types in the project.
|
307 |
+
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
308 |
+
|
309 |
+
### Added
|
310 |
+
- pre-commit configuration.
|
311 |
+
- noxfile.
|
312 |
+
|
313 |
+
### Removed
|
314 |
+
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
315 |
+
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
316 |
+
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
317 |
+
- Unused `utils.range_scan` function.
|
318 |
+
|
319 |
+
### Fixed
|
320 |
+
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
321 |
+
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
322 |
+
|
323 |
+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
324 |
+
|
325 |
+
### Added
|
326 |
+
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
|
327 |
+
- Support for Python 3.13 (#512)
|
328 |
+
|
329 |
+
### Fixed
|
330 |
+
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
331 |
+
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
|
332 |
+
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
333 |
+
|
334 |
+
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
335 |
+
|
336 |
+
### Fixed
|
337 |
+
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
338 |
+
- Regression on some detection case showcased in the documentation (#371)
|
339 |
+
|
340 |
+
### Added
|
341 |
+
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
342 |
+
|
343 |
+
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
344 |
+
|
345 |
+
### Changed
|
346 |
+
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
347 |
+
- Improved the general detection reliability based on reports from the community
|
348 |
+
|
349 |
+
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
350 |
+
|
351 |
+
### Added
|
352 |
+
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
353 |
+
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
354 |
+
|
355 |
+
### Removed
|
356 |
+
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
357 |
+
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
358 |
+
|
359 |
+
### Changed
|
360 |
+
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
361 |
+
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
362 |
+
|
363 |
+
### Fixed
|
364 |
+
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
365 |
+
|
366 |
+
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
367 |
+
|
368 |
+
### Changed
|
369 |
+
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
370 |
+
- Minor improvement over the global detection reliability
|
371 |
+
|
372 |
+
### Added
|
373 |
+
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
374 |
+
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
375 |
+
- Explicit support for Python 3.12
|
376 |
+
|
377 |
+
### Fixed
|
378 |
+
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
379 |
+
|
380 |
+
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
381 |
+
|
382 |
+
### Added
|
383 |
+
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
384 |
+
|
385 |
+
### Removed
|
386 |
+
- Support for Python 3.6 (PR #260)
|
387 |
+
|
388 |
+
### Changed
|
389 |
+
- Optional speedup provided by mypy/c 1.0.1
|
390 |
+
|
391 |
+
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
392 |
+
|
393 |
+
### Fixed
|
394 |
+
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
395 |
+
|
396 |
+
### Changed
|
397 |
+
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
398 |
+
|
399 |
+
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
400 |
+
|
401 |
+
### Added
|
402 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
403 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
404 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
405 |
+
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
406 |
+
|
407 |
+
### Changed
|
408 |
+
- Build with static metadata using 'build' frontend
|
409 |
+
- Make the language detection stricter
|
410 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
411 |
+
|
412 |
+
### Fixed
|
413 |
+
- CLI with opt --normalize fail when using full path for files
|
414 |
+
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
415 |
+
- Sphinx warnings when generating the documentation
|
416 |
+
|
417 |
+
### Removed
|
418 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
419 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
420 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
421 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
422 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
423 |
+
- Breaking: Top-level function `normalize`
|
424 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
425 |
+
- Support for the backport `unicodedata2`
|
426 |
+
|
427 |
+
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
428 |
+
|
429 |
+
### Added
|
430 |
+
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
431 |
+
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
432 |
+
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
433 |
+
|
434 |
+
### Changed
|
435 |
+
- Build with static metadata using 'build' frontend
|
436 |
+
- Make the language detection stricter
|
437 |
+
|
438 |
+
### Fixed
|
439 |
+
- CLI with opt --normalize fail when using full path for files
|
440 |
+
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
441 |
+
|
442 |
+
### Removed
|
443 |
+
- Coherence detector no longer return 'Simple English' instead return 'English'
|
444 |
+
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
445 |
+
|
446 |
+
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
447 |
+
|
448 |
+
### Added
|
449 |
+
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
450 |
+
|
451 |
+
### Removed
|
452 |
+
- Breaking: Method `first()` and `best()` from CharsetMatch
|
453 |
+
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
454 |
+
|
455 |
+
### Fixed
|
456 |
+
- Sphinx warnings when generating the documentation
|
457 |
+
|
458 |
+
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
459 |
+
|
460 |
+
### Changed
|
461 |
+
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
462 |
+
|
463 |
+
### Removed
|
464 |
+
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
465 |
+
- Breaking: Top-level function `normalize`
|
466 |
+
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
467 |
+
- Support for the backport `unicodedata2`
|
468 |
+
|
469 |
+
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
470 |
+
|
471 |
+
### Deprecated
|
472 |
+
- Function `normalize` scheduled for removal in 3.0
|
473 |
+
|
474 |
+
### Changed
|
475 |
+
- Removed useless call to decode in fn is_unprintable (#206)
|
476 |
+
|
477 |
+
### Fixed
|
478 |
+
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
479 |
+
|
480 |
+
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
481 |
+
|
482 |
+
### Added
|
483 |
+
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
484 |
+
|
485 |
+
### Changed
|
486 |
+
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
487 |
+
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
488 |
+
|
489 |
+
### Fixed
|
490 |
+
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
491 |
+
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
492 |
+
|
493 |
+
### Removed
|
494 |
+
- Support for Python 3.5 (PR #192)
|
495 |
+
|
496 |
+
### Deprecated
|
497 |
+
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
498 |
+
|
499 |
+
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
500 |
+
|
501 |
+
### Fixed
|
502 |
+
- ASCII miss-detection on rare cases (PR #170)
|
503 |
+
|
504 |
+
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
505 |
+
|
506 |
+
### Added
|
507 |
+
- Explicit support for Python 3.11 (PR #164)
|
508 |
+
|
509 |
+
### Changed
|
510 |
+
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
511 |
+
|
512 |
+
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
513 |
+
|
514 |
+
### Fixed
|
515 |
+
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
516 |
+
|
517 |
+
### Changed
|
518 |
+
- Skipping the language-detection (CD) on ASCII (PR #155)
|
519 |
+
|
520 |
+
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
521 |
+
|
522 |
+
### Changed
|
523 |
+
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
524 |
+
|
525 |
+
### Fixed
|
526 |
+
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
527 |
+
|
528 |
+
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
529 |
+
### Changed
|
530 |
+
- Improvement over Vietnamese detection (PR #126)
|
531 |
+
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
532 |
+
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
533 |
+
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
534 |
+
- Code style as refactored by Sourcery-AI (PR #131)
|
535 |
+
- Minor adjustment on the MD around european words (PR #133)
|
536 |
+
- Remove and replace SRTs from assets / tests (PR #139)
|
537 |
+
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
538 |
+
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
539 |
+
|
540 |
+
### Fixed
|
541 |
+
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
542 |
+
- Avoid using too insignificant chunk (PR #137)
|
543 |
+
|
544 |
+
### Added
|
545 |
+
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
546 |
+
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
547 |
+
|
548 |
+
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
549 |
+
### Added
|
550 |
+
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
551 |
+
|
552 |
+
### Changed
|
553 |
+
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
554 |
+
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
555 |
+
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
556 |
+
- Various detection improvement (MD+CD) (PR #117)
|
557 |
+
|
558 |
+
### Removed
|
559 |
+
- Remove redundant logging entry about detected language(s) (PR #115)
|
560 |
+
|
561 |
+
### Fixed
|
562 |
+
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
563 |
+
|
564 |
+
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
565 |
+
### Fixed
|
566 |
+
+- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
+- Fix CLI crash when using --minimal output in certain cases (PR #103)
+
+### Changed
+- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
+
+## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
+### Changed
+- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
+- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
+- The Unicode detection is slightly improved (PR #93)
+- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
+
+### Removed
+- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
+
+### Fixed
+- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
+- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
+- The MANIFEST.in was not exhaustive (PR #78)
+
+## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
+### Fixed
+- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
+- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
+- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
+- Submatch factoring could be wrong in rare edge cases (PR #72)
+- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
+- Fix line endings from CRLF to LF for certain project files (PR #67)
+
+### Changed
+- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
+- Allow fallback on specified encoding if any (PR #71)
+
+## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
+### Changed
+- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
+- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
+
+## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
+### Fixed
+- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
+
+### Changed
+- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
+
+## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
+### Fixed
+- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
+- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
+- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
+- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
+
+### Changed
+- Public function normalize default args values were not aligned with from_bytes (PR #53)
+
+### Added
+- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
+
+## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
+### Changed
+- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
+- Accent has been made on UTF-8 detection, should perform rather instantaneous.
+- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
+- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
+- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
+- utf_7 detection has been reinstated.
+
+### Removed
+- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
+- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
+- The exception hook on UnicodeDecodeError has been removed.
+
+### Deprecated
+- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
+
+### Fixed
+- The CLI output used the relative path of the file(s). Should be absolute.
+
+## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
+### Fixed
+- Logger configuration/usage no longer conflict with others (PR #44)
+
+## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
+### Removed
+- Using standard logging instead of using the package loguru.
+- Dropping nose test framework in favor of the maintained pytest.
+- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
+- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
+- Stop support for UTF-7 that does not contain a SIG.
+- Dropping PrettyTable, replaced with pure JSON output in CLI.
+
+### Fixed
+- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
+- Not searching properly for the BOM when trying utf32/16 parent codec.
+
+### Changed
+- Improving the package final size by compressing frequencies.json.
+- Huge improvement over the larges payload.
+
+### Added
+- CLI now produces JSON consumable output.
+- Return ASCII if given sequences fit. Given reasonable confidence.
+
+## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
+
+### Fixed
+- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
+
+## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
+
+### Fixed
+- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
+
+## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
+
+### Fixed
+- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
+
+## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
+
+### Changed
+- Amend the previous release to allow prettytable 2.0 (PR #35)
+
+## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
+
+### Fixed
+- Fix error while using the package with a python pre-release interpreter (PR #33)
+
+### Changed
+- Dependencies refactoring, constraints revised.
+
+### Added
+- Add python 3.9 and 3.10 to the supported interpreters
+
+MIT License
+
+Copyright (c) 2025 TAHRI Ahmed R.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD
ADDED
@@ -0,0 +1,35 @@
1 |
+
../../Scripts/normalizer.exe,sha256=aGyf7WAVLi4gHrr8F-d9-4fQG9ifpfMEXEvLwyt8KjI,108411
|
2 |
+
charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
3 |
+
charset_normalizer-3.4.1.dist-info/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
|
4 |
+
charset_normalizer-3.4.1.dist-info/METADATA,sha256=0_fAC3DknimRZusm6kkP4ylPD0JVzBq5mKHWLNBJM6w,36034
|
5 |
+
charset_normalizer-3.4.1.dist-info/RECORD,,
|
6 |
+
charset_normalizer-3.4.1.dist-info/WHEEL,sha256=pWXrJbnZSH-J-PhYmKs2XNn4DHCPNBYq965vsBJBFvA,101
|
7 |
+
charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
|
8 |
+
charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
9 |
+
charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
|
10 |
+
charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
|
11 |
+
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
|
12 |
+
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
|
13 |
+
charset_normalizer/__pycache__/api.cpython-312.pyc,,
|
14 |
+
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
|
15 |
+
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
|
16 |
+
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
|
17 |
+
charset_normalizer/__pycache__/md.cpython-312.pyc,,
|
18 |
+
charset_normalizer/__pycache__/models.cpython-312.pyc,,
|
19 |
+
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
|
20 |
+
charset_normalizer/__pycache__/version.cpython-312.pyc,,
|
21 |
+
charset_normalizer/api.py,sha256=2a0p2Gnhbdo9O6C04CNxTSN23fIbgOF20nxb0pWPNFM,23285
|
22 |
+
charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
|
23 |
+
charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
|
24 |
+
charset_normalizer/cli/__main__.py,sha256=lZ89qRWun7FRxX0qm1GhK-m0DH0i048yiMAX1mVIuRg,10731
|
25 |
+
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
|
26 |
+
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
|
27 |
+
charset_normalizer/constant.py,sha256=7OKYi28cJjZxIcX3lQCwfK9ijoOgaVEbERww7SqqNSY,42475
|
28 |
+
charset_normalizer/legacy.py,sha256=v8An1aAQHUu036UWOhyIaDGkirZ0t4hfNVlyje5KInU,2394
|
29 |
+
charset_normalizer/md.cp312-win_amd64.pyd,sha256=XBGy--IKda7c3iBfvw_dovocqb2RSucmVtxvtlG_3tA,10752
|
30 |
+
charset_normalizer/md.py,sha256=e452fhwIAguEUr3FJzG7QZvFgXI-dVLOh_M1ZUiFI6U,20666
|
31 |
+
charset_normalizer/md__mypyc.cp312-win_amd64.pyd,sha256=_-jWSji0BgBVvrIHbmabYQNMBF4-xTusdO5mu6P8JsA,125440
|
32 |
+
charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
|
33 |
+
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34 |
+
charset_normalizer/utils.py,sha256=oH9Q3WcAMwmsSB7uM8uDozz9DXnkYecbkTNbdnMbgzI,12410
|
35 |
+
charset_normalizer/version.py,sha256=7_thI7FzRQxEsbtUYwrJs3FCFWF666mw74H8mggPRR0,123
|
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.6.0)
+Root-Is-Purelib: false
+Tag: cp312-cp312-win_amd64
+
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+normalizer = charset_normalizer:cli.cli_detect
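
The entry point above wires the `normalizer` console script to `charset_normalizer.cli.cli_detect`. As a minimal sketch (not part of this upload), the same target can be called programmatically; `sample.txt` is a hypothetical placeholder path.

```python
# Hedged sketch: invoke the console-script target directly instead of the
# installed `normalizer` executable. "sample.txt" is a hypothetical path.
from charset_normalizer.cli import cli_detect

# Roughly equivalent to running `normalizer --minimal sample.txt` in a shell;
# cli_detect() returns 0 when everything went fine, non-zero otherwise.
exit_code = cli_detect(["--minimal", "sample.txt"])
print(exit_code)
```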
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+charset_normalizer
env/Lib/site-packages/charset_normalizer/__init__.py
ADDED
@@ -0,0 +1,48 @@
+"""
+Charset-Normalizer
+~~~~~~~~~~~~~~
+The Real First Universal Charset Detector.
+A library that helps you read text from an unknown charset encoding.
+Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
+All IANA character set names for which the Python core library provides codecs are supported.
+
+Basic usage:
+>>> from charset_normalizer import from_bytes
+>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
+>>> best_guess = results.best()
+>>> str(best_guess)
+'Bсеки човек има право на образование. Oбразованието!'
+
+Others methods and usages are available - see the full documentation
+at <https://github.com/Ousret/charset_normalizer>.
+:copyright: (c) 2021 by Ahmed TAHRI
+:license: MIT, see LICENSE for more details.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from .api import from_bytes, from_fp, from_path, is_binary
+from .legacy import detect
+from .models import CharsetMatch, CharsetMatches
+from .utils import set_logging_handler
+from .version import VERSION, __version__
+
+__all__ = (
+    "from_fp",
+    "from_path",
+    "from_bytes",
+    "is_binary",
+    "detect",
+    "CharsetMatch",
+    "CharsetMatches",
+    "__version__",
+    "VERSION",
+    "set_logging_handler",
+)
+
+# Attach a NullHandler to the top level logger by default
+# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
+
+logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
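
For context, a minimal usage sketch of the public surface exported above (`from_path`, `detect`, and the `CharsetMatches` container). It is illustrative only; `legacy.txt` is a hypothetical file name.

```python
# Minimal sketch of the exports listed in __all__ above.
# "legacy.txt" is a hypothetical file name used only for illustration.
from charset_normalizer import detect, from_path

results = from_path("legacy.txt")      # CharsetMatches container
best_guess = results.best()            # best CharsetMatch, or None if nothing fit

if best_guess is not None:
    print(best_guess.encoding)         # e.g. "cp1252"
    text = str(best_guess)             # payload decoded to str

# chardet-compatible shortcut re-exported from .legacy
with open("legacy.txt", "rb") as fh:
    print(detect(fh.read()))           # {'encoding': ..., 'language': ..., 'confidence': ...}
```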
env/Lib/site-packages/charset_normalizer/__main__.py
ADDED
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .cli import cli_detect
+
+if __name__ == "__main__":
+    cli_detect()
env/Lib/site-packages/charset_normalizer/api.py
ADDED
@@ -0,0 +1,668 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import logging
|
4 |
+
from os import PathLike
|
5 |
+
from typing import BinaryIO
|
6 |
+
|
7 |
+
from .cd import (
|
8 |
+
coherence_ratio,
|
9 |
+
encoding_languages,
|
10 |
+
mb_encoding_languages,
|
11 |
+
merge_coherence_ratios,
|
12 |
+
)
|
13 |
+
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
14 |
+
from .md import mess_ratio
|
15 |
+
from .models import CharsetMatch, CharsetMatches
|
16 |
+
from .utils import (
|
17 |
+
any_specified_encoding,
|
18 |
+
cut_sequence_chunks,
|
19 |
+
iana_name,
|
20 |
+
identify_sig_or_bom,
|
21 |
+
is_cp_similar,
|
22 |
+
is_multi_byte_encoding,
|
23 |
+
should_strip_sig_or_bom,
|
24 |
+
)
|
25 |
+
|
26 |
+
logger = logging.getLogger("charset_normalizer")
|
27 |
+
explain_handler = logging.StreamHandler()
|
28 |
+
explain_handler.setFormatter(
|
29 |
+
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
30 |
+
)
|
31 |
+
|
32 |
+
|
33 |
+
def from_bytes(
|
34 |
+
sequences: bytes | bytearray,
|
35 |
+
steps: int = 5,
|
36 |
+
chunk_size: int = 512,
|
37 |
+
threshold: float = 0.2,
|
38 |
+
cp_isolation: list[str] | None = None,
|
39 |
+
cp_exclusion: list[str] | None = None,
|
40 |
+
preemptive_behaviour: bool = True,
|
41 |
+
explain: bool = False,
|
42 |
+
language_threshold: float = 0.1,
|
43 |
+
enable_fallback: bool = True,
|
44 |
+
) -> CharsetMatches:
|
45 |
+
"""
|
46 |
+
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
47 |
+
If there is no results, it is a strong indicator that the source is binary/not text.
|
48 |
+
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
|
49 |
+
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
50 |
+
|
51 |
+
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
52 |
+
but never take it for granted. Can improve the performance.
|
53 |
+
|
54 |
+
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
55 |
+
purpose.
|
56 |
+
|
57 |
+
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
58 |
+
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
59 |
+
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
60 |
+
Custom logging format and handler can be set manually.
|
61 |
+
"""
|
62 |
+
|
63 |
+
if not isinstance(sequences, (bytearray, bytes)):
|
64 |
+
raise TypeError(
|
65 |
+
"Expected object of type bytes or bytearray, got: {}".format(
|
66 |
+
type(sequences)
|
67 |
+
)
|
68 |
+
)
|
69 |
+
|
70 |
+
if explain:
|
71 |
+
previous_logger_level: int = logger.level
|
72 |
+
logger.addHandler(explain_handler)
|
73 |
+
logger.setLevel(TRACE)
|
74 |
+
|
75 |
+
length: int = len(sequences)
|
76 |
+
|
77 |
+
if length == 0:
|
78 |
+
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
79 |
+
if explain: # Defensive: ensure exit path clean handler
|
80 |
+
logger.removeHandler(explain_handler)
|
81 |
+
logger.setLevel(previous_logger_level or logging.WARNING)
|
82 |
+
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
83 |
+
|
84 |
+
if cp_isolation is not None:
|
85 |
+
logger.log(
|
86 |
+
TRACE,
|
87 |
+
"cp_isolation is set. use this flag for debugging purpose. "
|
88 |
+
"limited list of encoding allowed : %s.",
|
89 |
+
", ".join(cp_isolation),
|
90 |
+
)
|
91 |
+
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
92 |
+
else:
|
93 |
+
cp_isolation = []
|
94 |
+
|
95 |
+
if cp_exclusion is not None:
|
96 |
+
logger.log(
|
97 |
+
TRACE,
|
98 |
+
"cp_exclusion is set. use this flag for debugging purpose. "
|
99 |
+
"limited list of encoding excluded : %s.",
|
100 |
+
", ".join(cp_exclusion),
|
101 |
+
)
|
102 |
+
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
103 |
+
else:
|
104 |
+
cp_exclusion = []
|
105 |
+
|
106 |
+
if length <= (chunk_size * steps):
|
107 |
+
logger.log(
|
108 |
+
TRACE,
|
109 |
+
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
110 |
+
steps,
|
111 |
+
chunk_size,
|
112 |
+
length,
|
113 |
+
)
|
114 |
+
steps = 1
|
115 |
+
chunk_size = length
|
116 |
+
|
117 |
+
if steps > 1 and length / steps < chunk_size:
|
118 |
+
chunk_size = int(length / steps)
|
119 |
+
|
120 |
+
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
121 |
+
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
122 |
+
|
123 |
+
if is_too_small_sequence:
|
124 |
+
logger.log(
|
125 |
+
TRACE,
|
126 |
+
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
127 |
+
length
|
128 |
+
),
|
129 |
+
)
|
130 |
+
elif is_too_large_sequence:
|
131 |
+
logger.log(
|
132 |
+
TRACE,
|
133 |
+
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
134 |
+
length
|
135 |
+
),
|
136 |
+
)
|
137 |
+
|
138 |
+
prioritized_encodings: list[str] = []
|
139 |
+
|
140 |
+
specified_encoding: str | None = (
|
141 |
+
any_specified_encoding(sequences) if preemptive_behaviour else None
|
142 |
+
)
|
143 |
+
|
144 |
+
if specified_encoding is not None:
|
145 |
+
prioritized_encodings.append(specified_encoding)
|
146 |
+
logger.log(
|
147 |
+
TRACE,
|
148 |
+
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
149 |
+
specified_encoding,
|
150 |
+
)
|
151 |
+
|
152 |
+
tested: set[str] = set()
|
153 |
+
tested_but_hard_failure: list[str] = []
|
154 |
+
tested_but_soft_failure: list[str] = []
|
155 |
+
|
156 |
+
fallback_ascii: CharsetMatch | None = None
|
157 |
+
fallback_u8: CharsetMatch | None = None
|
158 |
+
fallback_specified: CharsetMatch | None = None
|
159 |
+
|
160 |
+
results: CharsetMatches = CharsetMatches()
|
161 |
+
|
162 |
+
early_stop_results: CharsetMatches = CharsetMatches()
|
163 |
+
|
164 |
+
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
165 |
+
|
166 |
+
if sig_encoding is not None:
|
167 |
+
prioritized_encodings.append(sig_encoding)
|
168 |
+
logger.log(
|
169 |
+
TRACE,
|
170 |
+
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
171 |
+
len(sig_payload),
|
172 |
+
sig_encoding,
|
173 |
+
)
|
174 |
+
|
175 |
+
prioritized_encodings.append("ascii")
|
176 |
+
|
177 |
+
if "utf_8" not in prioritized_encodings:
|
178 |
+
prioritized_encodings.append("utf_8")
|
179 |
+
|
180 |
+
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
181 |
+
if cp_isolation and encoding_iana not in cp_isolation:
|
182 |
+
continue
|
183 |
+
|
184 |
+
if cp_exclusion and encoding_iana in cp_exclusion:
|
185 |
+
continue
|
186 |
+
|
187 |
+
if encoding_iana in tested:
|
188 |
+
continue
|
189 |
+
|
190 |
+
tested.add(encoding_iana)
|
191 |
+
|
192 |
+
decoded_payload: str | None = None
|
193 |
+
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
194 |
+
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
195 |
+
encoding_iana
|
196 |
+
)
|
197 |
+
|
198 |
+
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
199 |
+
logger.log(
|
200 |
+
TRACE,
|
201 |
+
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
202 |
+
encoding_iana,
|
203 |
+
)
|
204 |
+
continue
|
205 |
+
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
206 |
+
logger.log(
|
207 |
+
TRACE,
|
208 |
+
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
209 |
+
encoding_iana,
|
210 |
+
)
|
211 |
+
continue
|
212 |
+
|
213 |
+
try:
|
214 |
+
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
215 |
+
except (ModuleNotFoundError, ImportError):
|
216 |
+
logger.log(
|
217 |
+
TRACE,
|
218 |
+
"Encoding %s does not provide an IncrementalDecoder",
|
219 |
+
encoding_iana,
|
220 |
+
)
|
221 |
+
continue
|
222 |
+
|
223 |
+
try:
|
224 |
+
if is_too_large_sequence and is_multi_byte_decoder is False:
|
225 |
+
str(
|
226 |
+
(
|
227 |
+
sequences[: int(50e4)]
|
228 |
+
if strip_sig_or_bom is False
|
229 |
+
else sequences[len(sig_payload) : int(50e4)]
|
230 |
+
),
|
231 |
+
encoding=encoding_iana,
|
232 |
+
)
|
233 |
+
else:
|
234 |
+
decoded_payload = str(
|
235 |
+
(
|
236 |
+
sequences
|
237 |
+
if strip_sig_or_bom is False
|
238 |
+
else sequences[len(sig_payload) :]
|
239 |
+
),
|
240 |
+
encoding=encoding_iana,
|
241 |
+
)
|
242 |
+
except (UnicodeDecodeError, LookupError) as e:
|
243 |
+
if not isinstance(e, LookupError):
|
244 |
+
logger.log(
|
245 |
+
TRACE,
|
246 |
+
"Code page %s does not fit given bytes sequence at ALL. %s",
|
247 |
+
encoding_iana,
|
248 |
+
str(e),
|
249 |
+
)
|
250 |
+
tested_but_hard_failure.append(encoding_iana)
|
251 |
+
continue
|
252 |
+
|
253 |
+
similar_soft_failure_test: bool = False
|
254 |
+
|
255 |
+
for encoding_soft_failed in tested_but_soft_failure:
|
256 |
+
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
257 |
+
similar_soft_failure_test = True
|
258 |
+
break
|
259 |
+
|
260 |
+
if similar_soft_failure_test:
|
261 |
+
logger.log(
|
262 |
+
TRACE,
|
263 |
+
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
264 |
+
encoding_iana,
|
265 |
+
encoding_soft_failed,
|
266 |
+
)
|
267 |
+
continue
|
268 |
+
|
269 |
+
r_ = range(
|
270 |
+
0 if not bom_or_sig_available else len(sig_payload),
|
271 |
+
length,
|
272 |
+
int(length / steps),
|
273 |
+
)
|
274 |
+
|
275 |
+
multi_byte_bonus: bool = (
|
276 |
+
is_multi_byte_decoder
|
277 |
+
and decoded_payload is not None
|
278 |
+
and len(decoded_payload) < length
|
279 |
+
)
|
280 |
+
|
281 |
+
if multi_byte_bonus:
|
282 |
+
logger.log(
|
283 |
+
TRACE,
|
284 |
+
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
285 |
+
"was encoded using n-bytes.",
|
286 |
+
encoding_iana,
|
287 |
+
)
|
288 |
+
|
289 |
+
max_chunk_gave_up: int = int(len(r_) / 4)
|
290 |
+
|
291 |
+
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
292 |
+
early_stop_count: int = 0
|
293 |
+
lazy_str_hard_failure = False
|
294 |
+
|
295 |
+
md_chunks: list[str] = []
|
296 |
+
md_ratios = []
|
297 |
+
|
298 |
+
try:
|
299 |
+
for chunk in cut_sequence_chunks(
|
300 |
+
sequences,
|
301 |
+
encoding_iana,
|
302 |
+
r_,
|
303 |
+
chunk_size,
|
304 |
+
bom_or_sig_available,
|
305 |
+
strip_sig_or_bom,
|
306 |
+
sig_payload,
|
307 |
+
is_multi_byte_decoder,
|
308 |
+
decoded_payload,
|
309 |
+
):
|
310 |
+
md_chunks.append(chunk)
|
311 |
+
|
312 |
+
md_ratios.append(
|
313 |
+
mess_ratio(
|
314 |
+
chunk,
|
315 |
+
threshold,
|
316 |
+
explain is True and 1 <= len(cp_isolation) <= 2,
|
317 |
+
)
|
318 |
+
)
|
319 |
+
|
320 |
+
if md_ratios[-1] >= threshold:
|
321 |
+
early_stop_count += 1
|
322 |
+
|
323 |
+
if (early_stop_count >= max_chunk_gave_up) or (
|
324 |
+
bom_or_sig_available and strip_sig_or_bom is False
|
325 |
+
):
|
326 |
+
break
|
327 |
+
except (
|
328 |
+
UnicodeDecodeError
|
329 |
+
) as e: # Lazy str loading may have missed something there
|
330 |
+
logger.log(
|
331 |
+
TRACE,
|
332 |
+
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
333 |
+
encoding_iana,
|
334 |
+
str(e),
|
335 |
+
)
|
336 |
+
early_stop_count = max_chunk_gave_up
|
337 |
+
lazy_str_hard_failure = True
|
338 |
+
|
339 |
+
# We might want to check the sequence again with the whole content
|
340 |
+
# Only if initial MD tests passes
|
341 |
+
if (
|
342 |
+
not lazy_str_hard_failure
|
343 |
+
and is_too_large_sequence
|
344 |
+
and not is_multi_byte_decoder
|
345 |
+
):
|
346 |
+
try:
|
347 |
+
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
348 |
+
except UnicodeDecodeError as e:
|
349 |
+
logger.log(
|
350 |
+
TRACE,
|
351 |
+
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
352 |
+
encoding_iana,
|
353 |
+
str(e),
|
354 |
+
)
|
355 |
+
tested_but_hard_failure.append(encoding_iana)
|
356 |
+
continue
|
357 |
+
|
358 |
+
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
359 |
+
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
360 |
+
tested_but_soft_failure.append(encoding_iana)
|
361 |
+
logger.log(
|
362 |
+
TRACE,
|
363 |
+
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
364 |
+
"Computed mean chaos is %f %%.",
|
365 |
+
encoding_iana,
|
366 |
+
early_stop_count,
|
367 |
+
round(mean_mess_ratio * 100, ndigits=3),
|
368 |
+
)
|
369 |
+
# Preparing those fallbacks in case we got nothing.
|
370 |
+
if (
|
371 |
+
enable_fallback
|
372 |
+
and encoding_iana in ["ascii", "utf_8", specified_encoding]
|
373 |
+
and not lazy_str_hard_failure
|
374 |
+
):
|
375 |
+
fallback_entry = CharsetMatch(
|
376 |
+
sequences,
|
377 |
+
encoding_iana,
|
378 |
+
threshold,
|
379 |
+
False,
|
380 |
+
[],
|
381 |
+
decoded_payload,
|
382 |
+
preemptive_declaration=specified_encoding,
|
383 |
+
)
|
384 |
+
if encoding_iana == specified_encoding:
|
385 |
+
fallback_specified = fallback_entry
|
386 |
+
elif encoding_iana == "ascii":
|
387 |
+
fallback_ascii = fallback_entry
|
388 |
+
else:
|
389 |
+
fallback_u8 = fallback_entry
|
390 |
+
continue
|
391 |
+
|
392 |
+
logger.log(
|
393 |
+
TRACE,
|
394 |
+
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
395 |
+
encoding_iana,
|
396 |
+
round(mean_mess_ratio * 100, ndigits=3),
|
397 |
+
)
|
398 |
+
|
399 |
+
if not is_multi_byte_decoder:
|
400 |
+
target_languages: list[str] = encoding_languages(encoding_iana)
|
401 |
+
else:
|
402 |
+
target_languages = mb_encoding_languages(encoding_iana)
|
403 |
+
|
404 |
+
if target_languages:
|
405 |
+
logger.log(
|
406 |
+
TRACE,
|
407 |
+
"{} should target any language(s) of {}".format(
|
408 |
+
encoding_iana, str(target_languages)
|
409 |
+
),
|
410 |
+
)
|
411 |
+
|
412 |
+
cd_ratios = []
|
413 |
+
|
414 |
+
# We shall skip the CD when its about ASCII
|
415 |
+
# Most of the time its not relevant to run "language-detection" on it.
|
416 |
+
if encoding_iana != "ascii":
|
417 |
+
for chunk in md_chunks:
|
418 |
+
chunk_languages = coherence_ratio(
|
419 |
+
chunk,
|
420 |
+
language_threshold,
|
421 |
+
",".join(target_languages) if target_languages else None,
|
422 |
+
)
|
423 |
+
|
424 |
+
cd_ratios.append(chunk_languages)
|
425 |
+
|
426 |
+
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
427 |
+
|
428 |
+
if cd_ratios_merged:
|
429 |
+
logger.log(
|
430 |
+
TRACE,
|
431 |
+
"We detected language {} using {}".format(
|
432 |
+
cd_ratios_merged, encoding_iana
|
433 |
+
),
|
434 |
+
)
|
435 |
+
|
436 |
+
current_match = CharsetMatch(
|
437 |
+
sequences,
|
438 |
+
encoding_iana,
|
439 |
+
mean_mess_ratio,
|
440 |
+
bom_or_sig_available,
|
441 |
+
cd_ratios_merged,
|
442 |
+
(
|
443 |
+
decoded_payload
|
444 |
+
if (
|
445 |
+
is_too_large_sequence is False
|
446 |
+
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
447 |
+
)
|
448 |
+
else None
|
449 |
+
),
|
450 |
+
preemptive_declaration=specified_encoding,
|
451 |
+
)
|
452 |
+
|
453 |
+
results.append(current_match)
|
454 |
+
|
455 |
+
if (
|
456 |
+
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
457 |
+
and mean_mess_ratio < 0.1
|
458 |
+
):
|
459 |
+
# If md says nothing to worry about, then... stop immediately!
|
460 |
+
if mean_mess_ratio == 0.0:
|
461 |
+
logger.debug(
|
462 |
+
"Encoding detection: %s is most likely the one.",
|
463 |
+
current_match.encoding,
|
464 |
+
)
|
465 |
+
if explain: # Defensive: ensure exit path clean handler
|
466 |
+
logger.removeHandler(explain_handler)
|
467 |
+
logger.setLevel(previous_logger_level)
|
468 |
+
return CharsetMatches([current_match])
|
469 |
+
|
470 |
+
early_stop_results.append(current_match)
|
471 |
+
|
472 |
+
if (
|
473 |
+
len(early_stop_results)
|
474 |
+
and (specified_encoding is None or specified_encoding in tested)
|
475 |
+
and "ascii" in tested
|
476 |
+
and "utf_8" in tested
|
477 |
+
):
|
478 |
+
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
479 |
+
logger.debug(
|
480 |
+
"Encoding detection: %s is most likely the one.",
|
481 |
+
probable_result.encoding,
|
482 |
+
)
|
483 |
+
if explain: # Defensive: ensure exit path clean handler
|
484 |
+
logger.removeHandler(explain_handler)
|
485 |
+
logger.setLevel(previous_logger_level)
|
486 |
+
|
487 |
+
return CharsetMatches([probable_result])
|
488 |
+
|
489 |
+
if encoding_iana == sig_encoding:
|
490 |
+
logger.debug(
|
491 |
+
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
492 |
+
"the beginning of the sequence.",
|
493 |
+
encoding_iana,
|
494 |
+
)
|
495 |
+
if explain: # Defensive: ensure exit path clean handler
|
496 |
+
logger.removeHandler(explain_handler)
|
497 |
+
logger.setLevel(previous_logger_level)
|
498 |
+
return CharsetMatches([results[encoding_iana]])
|
499 |
+
|
500 |
+
if len(results) == 0:
|
501 |
+
if fallback_u8 or fallback_ascii or fallback_specified:
|
502 |
+
logger.log(
|
503 |
+
TRACE,
|
504 |
+
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
505 |
+
)
|
506 |
+
|
507 |
+
if fallback_specified:
|
508 |
+
logger.debug(
|
509 |
+
"Encoding detection: %s will be used as a fallback match",
|
510 |
+
fallback_specified.encoding,
|
511 |
+
)
|
512 |
+
results.append(fallback_specified)
|
513 |
+
elif (
|
514 |
+
(fallback_u8 and fallback_ascii is None)
|
515 |
+
or (
|
516 |
+
fallback_u8
|
517 |
+
and fallback_ascii
|
518 |
+
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
519 |
+
)
|
520 |
+
or (fallback_u8 is not None)
|
521 |
+
):
|
522 |
+
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
523 |
+
results.append(fallback_u8)
|
524 |
+
elif fallback_ascii:
|
525 |
+
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
526 |
+
results.append(fallback_ascii)
|
527 |
+
|
528 |
+
if results:
|
529 |
+
logger.debug(
|
530 |
+
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
531 |
+
results.best().encoding, # type: ignore
|
532 |
+
len(results) - 1,
|
533 |
+
)
|
534 |
+
else:
|
535 |
+
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
536 |
+
|
537 |
+
if explain:
|
538 |
+
logger.removeHandler(explain_handler)
|
539 |
+
logger.setLevel(previous_logger_level)
|
540 |
+
|
541 |
+
return results
|
542 |
+
|
543 |
+
|
544 |
+
def from_fp(
|
545 |
+
fp: BinaryIO,
|
546 |
+
steps: int = 5,
|
547 |
+
chunk_size: int = 512,
|
548 |
+
threshold: float = 0.20,
|
549 |
+
cp_isolation: list[str] | None = None,
|
550 |
+
cp_exclusion: list[str] | None = None,
|
551 |
+
preemptive_behaviour: bool = True,
|
552 |
+
explain: bool = False,
|
553 |
+
language_threshold: float = 0.1,
|
554 |
+
enable_fallback: bool = True,
|
555 |
+
) -> CharsetMatches:
|
556 |
+
"""
|
557 |
+
Same thing than the function from_bytes but using a file pointer that is already ready.
|
558 |
+
Will not close the file pointer.
|
559 |
+
"""
|
560 |
+
return from_bytes(
|
561 |
+
fp.read(),
|
562 |
+
steps,
|
563 |
+
chunk_size,
|
564 |
+
threshold,
|
565 |
+
cp_isolation,
|
566 |
+
cp_exclusion,
|
567 |
+
preemptive_behaviour,
|
568 |
+
explain,
|
569 |
+
language_threshold,
|
570 |
+
enable_fallback,
|
571 |
+
)
|
572 |
+
|
573 |
+
|
574 |
+
def from_path(
|
575 |
+
path: str | bytes | PathLike, # type: ignore[type-arg]
|
576 |
+
steps: int = 5,
|
577 |
+
chunk_size: int = 512,
|
578 |
+
threshold: float = 0.20,
|
579 |
+
cp_isolation: list[str] | None = None,
|
580 |
+
cp_exclusion: list[str] | None = None,
|
581 |
+
preemptive_behaviour: bool = True,
|
582 |
+
explain: bool = False,
|
583 |
+
language_threshold: float = 0.1,
|
584 |
+
enable_fallback: bool = True,
|
585 |
+
) -> CharsetMatches:
|
586 |
+
"""
|
587 |
+
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
|
588 |
+
Can raise IOError.
|
589 |
+
"""
|
590 |
+
with open(path, "rb") as fp:
|
591 |
+
return from_fp(
|
592 |
+
fp,
|
593 |
+
steps,
|
594 |
+
chunk_size,
|
595 |
+
threshold,
|
596 |
+
cp_isolation,
|
597 |
+
cp_exclusion,
|
598 |
+
preemptive_behaviour,
|
599 |
+
explain,
|
600 |
+
language_threshold,
|
601 |
+
enable_fallback,
|
602 |
+
)
|
603 |
+
|
604 |
+
|
605 |
+
def is_binary(
|
606 |
+
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
|
607 |
+
steps: int = 5,
|
608 |
+
chunk_size: int = 512,
|
609 |
+
threshold: float = 0.20,
|
610 |
+
cp_isolation: list[str] | None = None,
|
611 |
+
cp_exclusion: list[str] | None = None,
|
612 |
+
preemptive_behaviour: bool = True,
|
613 |
+
explain: bool = False,
|
614 |
+
language_threshold: float = 0.1,
|
615 |
+
enable_fallback: bool = False,
|
616 |
+
) -> bool:
|
617 |
+
"""
|
618 |
+
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
|
619 |
+
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
|
620 |
+
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
|
621 |
+
"""
|
622 |
+
if isinstance(fp_or_path_or_payload, (str, PathLike)):
|
623 |
+
guesses = from_path(
|
624 |
+
fp_or_path_or_payload,
|
625 |
+
steps=steps,
|
626 |
+
chunk_size=chunk_size,
|
627 |
+
threshold=threshold,
|
628 |
+
cp_isolation=cp_isolation,
|
629 |
+
cp_exclusion=cp_exclusion,
|
630 |
+
preemptive_behaviour=preemptive_behaviour,
|
631 |
+
explain=explain,
|
632 |
+
language_threshold=language_threshold,
|
633 |
+
enable_fallback=enable_fallback,
|
634 |
+
)
|
635 |
+
elif isinstance(
|
636 |
+
fp_or_path_or_payload,
|
637 |
+
(
|
638 |
+
bytes,
|
639 |
+
bytearray,
|
640 |
+
),
|
641 |
+
):
|
642 |
+
guesses = from_bytes(
|
643 |
+
fp_or_path_or_payload,
|
644 |
+
steps=steps,
|
645 |
+
chunk_size=chunk_size,
|
646 |
+
threshold=threshold,
|
647 |
+
cp_isolation=cp_isolation,
|
648 |
+
cp_exclusion=cp_exclusion,
|
649 |
+
preemptive_behaviour=preemptive_behaviour,
|
650 |
+
explain=explain,
|
651 |
+
language_threshold=language_threshold,
|
652 |
+
enable_fallback=enable_fallback,
|
653 |
+
)
|
654 |
+
else:
|
655 |
+
guesses = from_fp(
|
656 |
+
fp_or_path_or_payload,
|
657 |
+
steps=steps,
|
658 |
+
chunk_size=chunk_size,
|
659 |
+
threshold=threshold,
|
660 |
+
cp_isolation=cp_isolation,
|
661 |
+
cp_exclusion=cp_exclusion,
|
662 |
+
preemptive_behaviour=preemptive_behaviour,
|
663 |
+
explain=explain,
|
664 |
+
language_threshold=language_threshold,
|
665 |
+
enable_fallback=enable_fallback,
|
666 |
+
)
|
667 |
+
|
668 |
+
return not guesses
|
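
As a quick illustration of the options documented in the `from_bytes` docstring above (`cp_isolation`, fallbacks) and of `is_binary`, a hedged sketch follows; the sample payloads are made up for the example.

```python
# Hedged sketch of from_bytes keyword arguments and is_binary; the payloads
# below are made-up examples, not part of the uploaded code.
from charset_normalizer import from_bytes, is_binary

payload = "Déjà vu, c'était l'été.".encode("cp1252")

# cp_isolation narrows the tested code pages, as described in the docstring.
matches = from_bytes(payload, cp_isolation=["cp1252", "utf_8"])
best = matches.best()
print(best.encoding if best else "undetermined")

# is_binary reuses the same heuristics with fallback matches disabled
# (enable_fallback=False), so undecodable content yields True.
print(is_binary(b"\x00\xff\x00\xfe\x00"))  # likely True: no charset fits
print(is_binary(payload))                  # likely False: decodable text
```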
env/Lib/site-packages/charset_normalizer/cd.py
ADDED
@@ -0,0 +1,395 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import importlib
|
4 |
+
from codecs import IncrementalDecoder
|
5 |
+
from collections import Counter
|
6 |
+
from functools import lru_cache
|
7 |
+
from typing import Counter as TypeCounter
|
8 |
+
|
9 |
+
from .constant import (
|
10 |
+
FREQUENCIES,
|
11 |
+
KO_NAMES,
|
12 |
+
LANGUAGE_SUPPORTED_COUNT,
|
13 |
+
TOO_SMALL_SEQUENCE,
|
14 |
+
ZH_NAMES,
|
15 |
+
)
|
16 |
+
from .md import is_suspiciously_successive_range
|
17 |
+
from .models import CoherenceMatches
|
18 |
+
from .utils import (
|
19 |
+
is_accentuated,
|
20 |
+
is_latin,
|
21 |
+
is_multi_byte_encoding,
|
22 |
+
is_unicode_range_secondary,
|
23 |
+
unicode_range,
|
24 |
+
)
|
25 |
+
|
26 |
+
|
27 |
+
def encoding_unicode_range(iana_name: str) -> list[str]:
|
28 |
+
"""
|
29 |
+
Return associated unicode ranges in a single byte code page.
|
30 |
+
"""
|
31 |
+
if is_multi_byte_encoding(iana_name):
|
32 |
+
raise OSError("Function not supported on multi-byte code page")
|
33 |
+
|
34 |
+
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
35 |
+
|
36 |
+
p: IncrementalDecoder = decoder(errors="ignore")
|
37 |
+
seen_ranges: dict[str, int] = {}
|
38 |
+
character_count: int = 0
|
39 |
+
|
40 |
+
for i in range(0x40, 0xFF):
|
41 |
+
chunk: str = p.decode(bytes([i]))
|
42 |
+
|
43 |
+
if chunk:
|
44 |
+
character_range: str | None = unicode_range(chunk)
|
45 |
+
|
46 |
+
if character_range is None:
|
47 |
+
continue
|
48 |
+
|
49 |
+
if is_unicode_range_secondary(character_range) is False:
|
50 |
+
if character_range not in seen_ranges:
|
51 |
+
seen_ranges[character_range] = 0
|
52 |
+
seen_ranges[character_range] += 1
|
53 |
+
character_count += 1
|
54 |
+
|
55 |
+
return sorted(
|
56 |
+
[
|
57 |
+
character_range
|
58 |
+
for character_range in seen_ranges
|
59 |
+
if seen_ranges[character_range] / character_count >= 0.15
|
60 |
+
]
|
61 |
+
)
|
62 |
+
|
63 |
+
|
64 |
+
def unicode_range_languages(primary_range: str) -> list[str]:
|
65 |
+
"""
|
66 |
+
Return inferred languages used with a unicode range.
|
67 |
+
"""
|
68 |
+
languages: list[str] = []
|
69 |
+
|
70 |
+
for language, characters in FREQUENCIES.items():
|
71 |
+
for character in characters:
|
72 |
+
if unicode_range(character) == primary_range:
|
73 |
+
languages.append(language)
|
74 |
+
break
|
75 |
+
|
76 |
+
return languages
|
77 |
+
|
78 |
+
|
79 |
+
@lru_cache()
|
80 |
+
def encoding_languages(iana_name: str) -> list[str]:
|
81 |
+
"""
|
82 |
+
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
83 |
+
This function does the correspondence.
|
84 |
+
"""
|
85 |
+
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
86 |
+
primary_range: str | None = None
|
87 |
+
|
88 |
+
for specified_range in unicode_ranges:
|
89 |
+
if "Latin" not in specified_range:
|
90 |
+
primary_range = specified_range
|
91 |
+
break
|
92 |
+
|
93 |
+
if primary_range is None:
|
94 |
+
return ["Latin Based"]
|
95 |
+
|
96 |
+
return unicode_range_languages(primary_range)
|
97 |
+
|
98 |
+
|
99 |
+
@lru_cache()
|
100 |
+
def mb_encoding_languages(iana_name: str) -> list[str]:
|
101 |
+
"""
|
102 |
+
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
103 |
+
This function does the correspondence.
|
104 |
+
"""
|
105 |
+
if (
|
106 |
+
iana_name.startswith("shift_")
|
107 |
+
or iana_name.startswith("iso2022_jp")
|
108 |
+
or iana_name.startswith("euc_j")
|
109 |
+
or iana_name == "cp932"
|
110 |
+
):
|
111 |
+
return ["Japanese"]
|
112 |
+
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
113 |
+
return ["Chinese"]
|
114 |
+
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
115 |
+
return ["Korean"]
|
116 |
+
|
117 |
+
return []
|
118 |
+
|
119 |
+
|
120 |
+
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
121 |
+
def get_target_features(language: str) -> tuple[bool, bool]:
|
122 |
+
"""
|
123 |
+
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
124 |
+
"""
|
125 |
+
target_have_accents: bool = False
|
126 |
+
target_pure_latin: bool = True
|
127 |
+
|
128 |
+
for character in FREQUENCIES[language]:
|
129 |
+
if not target_have_accents and is_accentuated(character):
|
130 |
+
target_have_accents = True
|
131 |
+
if target_pure_latin and is_latin(character) is False:
|
132 |
+
target_pure_latin = False
|
133 |
+
|
134 |
+
return target_have_accents, target_pure_latin
|
135 |
+
|
136 |
+
|
137 |
+
def alphabet_languages(
|
138 |
+
characters: list[str], ignore_non_latin: bool = False
|
139 |
+
) -> list[str]:
|
140 |
+
"""
|
141 |
+
Return associated languages associated to given characters.
|
142 |
+
"""
|
143 |
+
languages: list[tuple[str, float]] = []
|
144 |
+
|
145 |
+
source_have_accents = any(is_accentuated(character) for character in characters)
|
146 |
+
|
147 |
+
for language, language_characters in FREQUENCIES.items():
|
148 |
+
target_have_accents, target_pure_latin = get_target_features(language)
|
149 |
+
|
150 |
+
if ignore_non_latin and target_pure_latin is False:
|
151 |
+
continue
|
152 |
+
|
153 |
+
if target_have_accents is False and source_have_accents:
|
154 |
+
continue
|
155 |
+
|
156 |
+
character_count: int = len(language_characters)
|
157 |
+
|
158 |
+
character_match_count: int = len(
|
159 |
+
[c for c in language_characters if c in characters]
|
160 |
+
)
|
161 |
+
|
162 |
+
ratio: float = character_match_count / character_count
|
163 |
+
|
164 |
+
if ratio >= 0.2:
|
165 |
+
languages.append((language, ratio))
|
166 |
+
|
167 |
+
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
168 |
+
|
169 |
+
return [compatible_language[0] for compatible_language in languages]
|
170 |
+
|
171 |
+
|
172 |
+
def characters_popularity_compare(
|
173 |
+
language: str, ordered_characters: list[str]
|
174 |
+
) -> float:
|
175 |
+
"""
|
176 |
+
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
|
177 |
+
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
178 |
+
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
|
179 |
+
"""
|
180 |
+
if language not in FREQUENCIES:
|
181 |
+
raise ValueError(f"{language} not available")
|
182 |
+
|
183 |
+
character_approved_count: int = 0
|
184 |
+
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
185 |
+
|
186 |
+
ordered_characters_count: int = len(ordered_characters)
|
187 |
+
target_language_characters_count: int = len(FREQUENCIES[language])
|
188 |
+
|
189 |
+
large_alphabet: bool = target_language_characters_count > 26
|
190 |
+
|
191 |
+
for character, character_rank in zip(
|
192 |
+
ordered_characters, range(0, ordered_characters_count)
|
193 |
+
):
|
194 |
+
if character not in FREQUENCIES_language_set:
|
195 |
+
continue
|
196 |
+
|
197 |
+
character_rank_in_language: int = FREQUENCIES[language].index(character)
|
198 |
+
expected_projection_ratio: float = (
|
199 |
+
target_language_characters_count / ordered_characters_count
|
200 |
+
)
|
201 |
+
character_rank_projection: int = int(character_rank * expected_projection_ratio)
|
202 |
+
|
203 |
+
if (
|
204 |
+
large_alphabet is False
|
205 |
+
and abs(character_rank_projection - character_rank_in_language) > 4
|
206 |
+
):
|
207 |
+
continue
|
208 |
+
|
209 |
+
if (
|
210 |
+
large_alphabet is True
|
211 |
+
and abs(character_rank_projection - character_rank_in_language)
|
212 |
+
< target_language_characters_count / 3
|
213 |
+
):
|
214 |
+
character_approved_count += 1
|
215 |
+
continue
|
216 |
+
|
217 |
+
characters_before_source: list[str] = FREQUENCIES[language][
|
218 |
+
0:character_rank_in_language
|
219 |
+
]
|
220 |
+
characters_after_source: list[str] = FREQUENCIES[language][
|
221 |
+
character_rank_in_language:
|
222 |
+
]
|
223 |
+
characters_before: list[str] = ordered_characters[0:character_rank]
|
224 |
+
characters_after: list[str] = ordered_characters[character_rank:]
|
225 |
+
|
226 |
+
before_match_count: int = len(
|
227 |
+
set(characters_before) & set(characters_before_source)
|
228 |
+
)
|
229 |
+
|
230 |
+
after_match_count: int = len(
|
231 |
+
set(characters_after) & set(characters_after_source)
|
232 |
+
)
|
233 |
+
|
234 |
+
if len(characters_before_source) == 0 and before_match_count <= 4:
|
235 |
+
character_approved_count += 1
|
236 |
+
continue
|
237 |
+
|
238 |
+
if len(characters_after_source) == 0 and after_match_count <= 4:
|
239 |
+
character_approved_count += 1
|
240 |
+
continue
|
241 |
+
|
242 |
+
if (
|
243 |
+
before_match_count / len(characters_before_source) >= 0.4
|
244 |
+
or after_match_count / len(characters_after_source) >= 0.4
|
245 |
+
):
|
246 |
+
character_approved_count += 1
|
247 |
+
continue
|
248 |
+
|
249 |
+
return character_approved_count / len(ordered_characters)
|
250 |
+
|
251 |
+
|
252 |
+
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
|
253 |
+
"""
|
254 |
+
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
|
255 |
+
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
256 |
+
One containing the latin letters and the other hebrew.
|
257 |
+
"""
|
258 |
+
layers: dict[str, str] = {}
|
259 |
+
|
260 |
+
for character in decoded_sequence:
|
261 |
+
if character.isalpha() is False:
|
262 |
+
continue
|
263 |
+
|
264 |
+
character_range: str | None = unicode_range(character)
|
265 |
+
|
266 |
+
if character_range is None:
|
267 |
+
continue
|
268 |
+
|
269 |
+
layer_target_range: str | None = None
|
270 |
+
|
271 |
+
for discovered_range in layers:
|
272 |
+
if (
|
273 |
+
is_suspiciously_successive_range(discovered_range, character_range)
|
274 |
+
is False
|
275 |
+
):
|
276 |
+
layer_target_range = discovered_range
|
277 |
+
break
|
278 |
+
|
279 |
+
if layer_target_range is None:
|
280 |
+
layer_target_range = character_range
|
281 |
+
|
282 |
+
if layer_target_range not in layers:
|
283 |
+
layers[layer_target_range] = character.lower()
|
284 |
+
continue
|
285 |
+
|
286 |
+
layers[layer_target_range] += character.lower()
|
287 |
+
|
288 |
+
return list(layers.values())
|
289 |
+
|
290 |
+
|
291 |
+
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
|
292 |
+
"""
|
293 |
+
This function merge results previously given by the function coherence_ratio.
|
294 |
+
The return type is the same as coherence_ratio.
|
295 |
+
"""
|
296 |
+
per_language_ratios: dict[str, list[float]] = {}
|
297 |
+
for result in results:
|
298 |
+
for sub_result in result:
|
299 |
+
language, ratio = sub_result
|
300 |
+
if language not in per_language_ratios:
|
301 |
+
per_language_ratios[language] = [ratio]
|
302 |
+
continue
|
303 |
+
per_language_ratios[language].append(ratio)
|
304 |
+
|
305 |
+
merge = [
|
306 |
+
(
|
307 |
+
language,
|
308 |
+
round(
|
309 |
+
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
310 |
+
4,
|
311 |
+
),
|
312 |
+
)
|
313 |
+
for language in per_language_ratios
|
314 |
+
]
|
315 |
+
|
316 |
+
return sorted(merge, key=lambda x: x[1], reverse=True)
|
317 |
+
|
318 |
+
|
319 |
+
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
|
320 |
+
"""
|
321 |
+
We shall NOT return "English—" in CoherenceMatches because it is an alternative
|
322 |
+
of "English". This function only keeps the best match and remove the em-dash in it.
|
323 |
+
"""
|
324 |
+
index_results: dict[str, list[float]] = dict()
|
325 |
+
|
326 |
+
for result in results:
|
327 |
+
language, ratio = result
|
328 |
+
no_em_name: str = language.replace("—", "")
|
329 |
+
|
330 |
+
if no_em_name not in index_results:
|
331 |
+
index_results[no_em_name] = []
|
332 |
+
|
333 |
+
index_results[no_em_name].append(ratio)
|
334 |
+
|
335 |
+
if any(len(index_results[e]) > 1 for e in index_results):
|
336 |
+
filtered_results: CoherenceMatches = []
|
337 |
+
|
338 |
+
for language in index_results:
|
339 |
+
filtered_results.append((language, max(index_results[language])))
|
340 |
+
|
341 |
+
return filtered_results
|
342 |
+
|
343 |
+
return results
|
344 |
+
|
345 |
+
|
346 |
+
@lru_cache(maxsize=2048)
|
347 |
+
def coherence_ratio(
|
348 |
+
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
|
349 |
+
) -> CoherenceMatches:
|
350 |
+
"""
|
351 |
+
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
|
352 |
+
A layer = Character extraction by alphabets/ranges.
|
353 |
+
"""
|
354 |
+
|
355 |
+
results: list[tuple[str, float]] = []
|
356 |
+
ignore_non_latin: bool = False
|
357 |
+
|
358 |
+
sufficient_match_count: int = 0
|
359 |
+
|
360 |
+
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
361 |
+
if "Latin Based" in lg_inclusion_list:
|
362 |
+
ignore_non_latin = True
|
363 |
+
lg_inclusion_list.remove("Latin Based")
|
364 |
+
|
365 |
+
for layer in alpha_unicode_split(decoded_sequence):
|
366 |
+
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
367 |
+
most_common = sequence_frequencies.most_common()
|
368 |
+
|
369 |
+
character_count: int = sum(o for c, o in most_common)
|
370 |
+
|
371 |
+
if character_count <= TOO_SMALL_SEQUENCE:
|
372 |
+
continue
|
373 |
+
|
374 |
+
popular_character_ordered: list[str] = [c for c, o in most_common]
|
375 |
+
|
376 |
+
for language in lg_inclusion_list or alphabet_languages(
|
377 |
+
popular_character_ordered, ignore_non_latin
|
378 |
+
):
|
379 |
+
ratio: float = characters_popularity_compare(
|
380 |
+
language, popular_character_ordered
|
381 |
+
)
|
382 |
+
|
383 |
+
if ratio < threshold:
|
384 |
+
continue
|
385 |
+
elif ratio >= 0.8:
|
386 |
+
sufficient_match_count += 1
|
387 |
+
|
388 |
+
results.append((language, round(ratio, 4)))
|
389 |
+
|
390 |
+
if sufficient_match_count >= 3:
|
391 |
+
break
|
392 |
+
|
393 |
+
return sorted(
|
394 |
+
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
|
395 |
+
)
|
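
The coherence detector defined above can also be exercised on its own; a small sketch follows, with a made-up Russian sample sentence for illustration.

```python
# Hedged sketch: probe the language-coherence layer (cd.py) directly.
# The sample sentence is made up for illustration.
from charset_normalizer.cd import coherence_ratio

sample = "Это небольшой пример текста на русском языке для проверки."
# Returns CoherenceMatches, e.g. [('Russian', 0.xx), ...] sorted by ratio.
print(coherence_ratio(sample))
```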
env/Lib/site-packages/charset_normalizer/cli/__init__.py
ADDED
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from .__main__ import cli_detect, query_yes_no
+
+__all__ = (
+    "cli_detect",
+    "query_yes_no",
+)
env/Lib/site-packages/charset_normalizer/cli/__main__.py
ADDED
@@ -0,0 +1,321 @@
from __future__ import annotations

import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from unicodedata import unidata_version

import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__


def query_yes_no(question: str, default: str = "yes") -> bool:
    """Ask a yes/no question via input() and return their answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
    It must be "yes" (the default), "no" or None (meaning
    an answer is required of the user).

    The "answer" return value is True for "yes" or False for "no".

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()
        if default is not None and choice == "":
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")


def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser
    :param argv:
    :return: 0 if everything is fine, anything else equal trouble
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    if args.replace is True and args.normalize is False:
        if args.files:
            for my_file in args.files:
                my_file.close()
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        if args.files:
            for my_file in args.files:
                my_file.close()
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        if args.files:
            for my_file in args.files:
                my_file.close()
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    x_ = []

    for my_file in args.files:
        matches = from_fp(
            my_file,
            threshold=args.threshold,
            explain=args.verbose,
            preemptive_behaviour=args.no_preemptive is False,
        )

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    (
                        "Maybe try increasing maximum amount of chaos."
                        if args.threshold < 1.0
                        else ""
                    ),
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    best_guess.encoding,
                    best_guess.encoding_aliases,
                    [
                        cp
                        for cp in best_guess.could_be_from_charset
                        if cp != best_guess.encoding
                    ],
                    best_guess.language,
                    best_guess.alphabets,
                    best_guess.bom,
                    best_guess.percent_chaos,
                    best_guess.percent_coherence,
                    None,
                    True,
                )
            )

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                dir_path = dirname(realpath(my_file.name))
                file_name = basename(realpath(my_file.name))

                o_: list[str] = file_name.split(".")

                if args.replace is False:
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                elif (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue

                try:
                    x_[0].unicode_path = join(dir_path, ".".join(o_))

                    with open(x_[0].unicode_path, "wb") as fp:
                        fp.write(best_guess.output())
                except OSError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0


if __name__ == "__main__":
    cli_detect()
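For reference, cli_detect accepts the same argument vector as the installed command, so it can be driven programmatically. A short sketch; the file name below is a placeholder, not a file shipped with this Space:

from charset_normalizer.cli import cli_detect

# Prints only the detected charset (the --minimal mode defined above) and returns 0 on success.
exit_code = cli_detect(["some_text_file.txt", "--minimal"])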
env/Lib/site-packages/charset_normalizer/constant.py
ADDED
@@ -0,0 +1,1998 @@
from __future__ import annotations

from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
from re import IGNORECASE
from re import compile as re_compile

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
    "utf_8": BOM_UTF8,
    "utf_7": [
        b"\x2b\x2f\x76\x38",
        b"\x2b\x2f\x76\x39",
        b"\x2b\x2f\x76\x2b",
        b"\x2b\x2f\x76\x2f",
        b"\x2b\x2f\x76\x38\x2d",
    ],
    "gb18030": b"\x84\x31\x95\x33",
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}

TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = int(10e6)

UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
28 |
+
# Up-to-date Unicode ucd/15.0.0
|
29 |
+
UNICODE_RANGES_COMBINED: dict[str, range] = {
|
30 |
+
"Control character": range(32),
|
31 |
+
"Basic Latin": range(32, 128),
|
32 |
+
"Latin-1 Supplement": range(128, 256),
|
33 |
+
"Latin Extended-A": range(256, 384),
|
34 |
+
"Latin Extended-B": range(384, 592),
|
35 |
+
"IPA Extensions": range(592, 688),
|
36 |
+
"Spacing Modifier Letters": range(688, 768),
|
37 |
+
"Combining Diacritical Marks": range(768, 880),
|
38 |
+
"Greek and Coptic": range(880, 1024),
|
39 |
+
"Cyrillic": range(1024, 1280),
|
40 |
+
"Cyrillic Supplement": range(1280, 1328),
|
41 |
+
"Armenian": range(1328, 1424),
|
42 |
+
"Hebrew": range(1424, 1536),
|
43 |
+
"Arabic": range(1536, 1792),
|
44 |
+
"Syriac": range(1792, 1872),
|
45 |
+
"Arabic Supplement": range(1872, 1920),
|
46 |
+
"Thaana": range(1920, 1984),
|
47 |
+
"NKo": range(1984, 2048),
|
48 |
+
"Samaritan": range(2048, 2112),
|
49 |
+
"Mandaic": range(2112, 2144),
|
50 |
+
"Syriac Supplement": range(2144, 2160),
|
51 |
+
"Arabic Extended-B": range(2160, 2208),
|
52 |
+
"Arabic Extended-A": range(2208, 2304),
|
53 |
+
"Devanagari": range(2304, 2432),
|
54 |
+
"Bengali": range(2432, 2560),
|
55 |
+
"Gurmukhi": range(2560, 2688),
|
56 |
+
"Gujarati": range(2688, 2816),
|
57 |
+
"Oriya": range(2816, 2944),
|
58 |
+
"Tamil": range(2944, 3072),
|
59 |
+
"Telugu": range(3072, 3200),
|
60 |
+
"Kannada": range(3200, 3328),
|
61 |
+
"Malayalam": range(3328, 3456),
|
62 |
+
"Sinhala": range(3456, 3584),
|
63 |
+
"Thai": range(3584, 3712),
|
64 |
+
"Lao": range(3712, 3840),
|
65 |
+
"Tibetan": range(3840, 4096),
|
66 |
+
"Myanmar": range(4096, 4256),
|
67 |
+
"Georgian": range(4256, 4352),
|
68 |
+
"Hangul Jamo": range(4352, 4608),
|
69 |
+
"Ethiopic": range(4608, 4992),
|
70 |
+
"Ethiopic Supplement": range(4992, 5024),
|
71 |
+
"Cherokee": range(5024, 5120),
|
72 |
+
"Unified Canadian Aboriginal Syllabics": range(5120, 5760),
|
73 |
+
"Ogham": range(5760, 5792),
|
74 |
+
"Runic": range(5792, 5888),
|
75 |
+
"Tagalog": range(5888, 5920),
|
76 |
+
"Hanunoo": range(5920, 5952),
|
77 |
+
"Buhid": range(5952, 5984),
|
78 |
+
"Tagbanwa": range(5984, 6016),
|
79 |
+
"Khmer": range(6016, 6144),
|
80 |
+
"Mongolian": range(6144, 6320),
|
81 |
+
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
|
82 |
+
"Limbu": range(6400, 6480),
|
83 |
+
"Tai Le": range(6480, 6528),
|
84 |
+
"New Tai Lue": range(6528, 6624),
|
85 |
+
"Khmer Symbols": range(6624, 6656),
|
86 |
+
"Buginese": range(6656, 6688),
|
87 |
+
"Tai Tham": range(6688, 6832),
|
88 |
+
"Combining Diacritical Marks Extended": range(6832, 6912),
|
89 |
+
"Balinese": range(6912, 7040),
|
90 |
+
"Sundanese": range(7040, 7104),
|
91 |
+
"Batak": range(7104, 7168),
|
92 |
+
"Lepcha": range(7168, 7248),
|
93 |
+
"Ol Chiki": range(7248, 7296),
|
94 |
+
"Cyrillic Extended-C": range(7296, 7312),
|
95 |
+
"Georgian Extended": range(7312, 7360),
|
96 |
+
"Sundanese Supplement": range(7360, 7376),
|
97 |
+
"Vedic Extensions": range(7376, 7424),
|
98 |
+
"Phonetic Extensions": range(7424, 7552),
|
99 |
+
"Phonetic Extensions Supplement": range(7552, 7616),
|
100 |
+
"Combining Diacritical Marks Supplement": range(7616, 7680),
|
101 |
+
"Latin Extended Additional": range(7680, 7936),
|
102 |
+
"Greek Extended": range(7936, 8192),
|
103 |
+
"General Punctuation": range(8192, 8304),
|
104 |
+
"Superscripts and Subscripts": range(8304, 8352),
|
105 |
+
"Currency Symbols": range(8352, 8400),
|
106 |
+
"Combining Diacritical Marks for Symbols": range(8400, 8448),
|
107 |
+
"Letterlike Symbols": range(8448, 8528),
|
108 |
+
"Number Forms": range(8528, 8592),
|
109 |
+
"Arrows": range(8592, 8704),
|
110 |
+
"Mathematical Operators": range(8704, 8960),
|
111 |
+
"Miscellaneous Technical": range(8960, 9216),
|
112 |
+
"Control Pictures": range(9216, 9280),
|
113 |
+
"Optical Character Recognition": range(9280, 9312),
|
114 |
+
"Enclosed Alphanumerics": range(9312, 9472),
|
115 |
+
"Box Drawing": range(9472, 9600),
|
116 |
+
"Block Elements": range(9600, 9632),
|
117 |
+
"Geometric Shapes": range(9632, 9728),
|
118 |
+
"Miscellaneous Symbols": range(9728, 9984),
|
119 |
+
"Dingbats": range(9984, 10176),
|
120 |
+
"Miscellaneous Mathematical Symbols-A": range(10176, 10224),
|
121 |
+
"Supplemental Arrows-A": range(10224, 10240),
|
122 |
+
"Braille Patterns": range(10240, 10496),
|
123 |
+
"Supplemental Arrows-B": range(10496, 10624),
|
124 |
+
"Miscellaneous Mathematical Symbols-B": range(10624, 10752),
|
125 |
+
"Supplemental Mathematical Operators": range(10752, 11008),
|
126 |
+
"Miscellaneous Symbols and Arrows": range(11008, 11264),
|
127 |
+
"Glagolitic": range(11264, 11360),
|
128 |
+
"Latin Extended-C": range(11360, 11392),
|
129 |
+
"Coptic": range(11392, 11520),
|
130 |
+
"Georgian Supplement": range(11520, 11568),
|
131 |
+
"Tifinagh": range(11568, 11648),
|
132 |
+
"Ethiopic Extended": range(11648, 11744),
|
133 |
+
"Cyrillic Extended-A": range(11744, 11776),
|
134 |
+
"Supplemental Punctuation": range(11776, 11904),
|
135 |
+
"CJK Radicals Supplement": range(11904, 12032),
|
136 |
+
"Kangxi Radicals": range(12032, 12256),
|
137 |
+
"Ideographic Description Characters": range(12272, 12288),
|
138 |
+
"CJK Symbols and Punctuation": range(12288, 12352),
|
139 |
+
"Hiragana": range(12352, 12448),
|
140 |
+
"Katakana": range(12448, 12544),
|
141 |
+
"Bopomofo": range(12544, 12592),
|
142 |
+
"Hangul Compatibility Jamo": range(12592, 12688),
|
143 |
+
"Kanbun": range(12688, 12704),
|
144 |
+
"Bopomofo Extended": range(12704, 12736),
|
145 |
+
"CJK Strokes": range(12736, 12784),
|
146 |
+
"Katakana Phonetic Extensions": range(12784, 12800),
|
147 |
+
"Enclosed CJK Letters and Months": range(12800, 13056),
|
148 |
+
"CJK Compatibility": range(13056, 13312),
|
149 |
+
"CJK Unified Ideographs Extension A": range(13312, 19904),
|
150 |
+
"Yijing Hexagram Symbols": range(19904, 19968),
|
151 |
+
"CJK Unified Ideographs": range(19968, 40960),
|
152 |
+
"Yi Syllables": range(40960, 42128),
|
153 |
+
"Yi Radicals": range(42128, 42192),
|
154 |
+
"Lisu": range(42192, 42240),
|
155 |
+
"Vai": range(42240, 42560),
|
156 |
+
"Cyrillic Extended-B": range(42560, 42656),
|
157 |
+
"Bamum": range(42656, 42752),
|
158 |
+
"Modifier Tone Letters": range(42752, 42784),
|
159 |
+
"Latin Extended-D": range(42784, 43008),
|
160 |
+
"Syloti Nagri": range(43008, 43056),
|
161 |
+
"Common Indic Number Forms": range(43056, 43072),
|
162 |
+
"Phags-pa": range(43072, 43136),
|
163 |
+
"Saurashtra": range(43136, 43232),
|
164 |
+
"Devanagari Extended": range(43232, 43264),
|
165 |
+
"Kayah Li": range(43264, 43312),
|
166 |
+
"Rejang": range(43312, 43360),
|
167 |
+
"Hangul Jamo Extended-A": range(43360, 43392),
|
168 |
+
"Javanese": range(43392, 43488),
|
169 |
+
"Myanmar Extended-B": range(43488, 43520),
|
170 |
+
"Cham": range(43520, 43616),
|
171 |
+
"Myanmar Extended-A": range(43616, 43648),
|
172 |
+
"Tai Viet": range(43648, 43744),
|
173 |
+
"Meetei Mayek Extensions": range(43744, 43776),
|
174 |
+
"Ethiopic Extended-A": range(43776, 43824),
|
175 |
+
"Latin Extended-E": range(43824, 43888),
|
176 |
+
"Cherokee Supplement": range(43888, 43968),
|
177 |
+
"Meetei Mayek": range(43968, 44032),
|
178 |
+
"Hangul Syllables": range(44032, 55216),
|
179 |
+
"Hangul Jamo Extended-B": range(55216, 55296),
|
180 |
+
"High Surrogates": range(55296, 56192),
|
181 |
+
"High Private Use Surrogates": range(56192, 56320),
|
182 |
+
"Low Surrogates": range(56320, 57344),
|
183 |
+
"Private Use Area": range(57344, 63744),
|
184 |
+
"CJK Compatibility Ideographs": range(63744, 64256),
|
185 |
+
"Alphabetic Presentation Forms": range(64256, 64336),
|
186 |
+
"Arabic Presentation Forms-A": range(64336, 65024),
|
187 |
+
"Variation Selectors": range(65024, 65040),
|
188 |
+
"Vertical Forms": range(65040, 65056),
|
189 |
+
"Combining Half Marks": range(65056, 65072),
|
190 |
+
"CJK Compatibility Forms": range(65072, 65104),
|
191 |
+
"Small Form Variants": range(65104, 65136),
|
192 |
+
"Arabic Presentation Forms-B": range(65136, 65280),
|
193 |
+
"Halfwidth and Fullwidth Forms": range(65280, 65520),
|
194 |
+
"Specials": range(65520, 65536),
|
195 |
+
"Linear B Syllabary": range(65536, 65664),
|
196 |
+
"Linear B Ideograms": range(65664, 65792),
|
197 |
+
"Aegean Numbers": range(65792, 65856),
|
198 |
+
"Ancient Greek Numbers": range(65856, 65936),
|
199 |
+
"Ancient Symbols": range(65936, 66000),
|
200 |
+
"Phaistos Disc": range(66000, 66048),
|
201 |
+
"Lycian": range(66176, 66208),
|
202 |
+
"Carian": range(66208, 66272),
|
203 |
+
"Coptic Epact Numbers": range(66272, 66304),
|
204 |
+
"Old Italic": range(66304, 66352),
|
205 |
+
"Gothic": range(66352, 66384),
|
206 |
+
"Old Permic": range(66384, 66432),
|
207 |
+
"Ugaritic": range(66432, 66464),
|
208 |
+
"Old Persian": range(66464, 66528),
|
209 |
+
"Deseret": range(66560, 66640),
|
210 |
+
"Shavian": range(66640, 66688),
|
211 |
+
"Osmanya": range(66688, 66736),
|
212 |
+
"Osage": range(66736, 66816),
|
213 |
+
"Elbasan": range(66816, 66864),
|
214 |
+
"Caucasian Albanian": range(66864, 66928),
|
215 |
+
"Vithkuqi": range(66928, 67008),
|
216 |
+
"Linear A": range(67072, 67456),
|
217 |
+
"Latin Extended-F": range(67456, 67520),
|
218 |
+
"Cypriot Syllabary": range(67584, 67648),
|
219 |
+
"Imperial Aramaic": range(67648, 67680),
|
220 |
+
"Palmyrene": range(67680, 67712),
|
221 |
+
"Nabataean": range(67712, 67760),
|
222 |
+
"Hatran": range(67808, 67840),
|
223 |
+
"Phoenician": range(67840, 67872),
|
224 |
+
"Lydian": range(67872, 67904),
|
225 |
+
"Meroitic Hieroglyphs": range(67968, 68000),
|
226 |
+
"Meroitic Cursive": range(68000, 68096),
|
227 |
+
"Kharoshthi": range(68096, 68192),
|
228 |
+
"Old South Arabian": range(68192, 68224),
|
229 |
+
"Old North Arabian": range(68224, 68256),
|
230 |
+
"Manichaean": range(68288, 68352),
|
231 |
+
"Avestan": range(68352, 68416),
|
232 |
+
"Inscriptional Parthian": range(68416, 68448),
|
233 |
+
"Inscriptional Pahlavi": range(68448, 68480),
|
234 |
+
"Psalter Pahlavi": range(68480, 68528),
|
235 |
+
"Old Turkic": range(68608, 68688),
|
236 |
+
"Old Hungarian": range(68736, 68864),
|
237 |
+
"Hanifi Rohingya": range(68864, 68928),
|
238 |
+
"Rumi Numeral Symbols": range(69216, 69248),
|
239 |
+
"Yezidi": range(69248, 69312),
|
240 |
+
"Arabic Extended-C": range(69312, 69376),
|
241 |
+
"Old Sogdian": range(69376, 69424),
|
242 |
+
"Sogdian": range(69424, 69488),
|
243 |
+
"Old Uyghur": range(69488, 69552),
|
244 |
+
"Chorasmian": range(69552, 69600),
|
245 |
+
"Elymaic": range(69600, 69632),
|
246 |
+
"Brahmi": range(69632, 69760),
|
247 |
+
"Kaithi": range(69760, 69840),
|
248 |
+
"Sora Sompeng": range(69840, 69888),
|
249 |
+
"Chakma": range(69888, 69968),
|
250 |
+
"Mahajani": range(69968, 70016),
|
251 |
+
"Sharada": range(70016, 70112),
|
252 |
+
"Sinhala Archaic Numbers": range(70112, 70144),
|
253 |
+
"Khojki": range(70144, 70224),
|
254 |
+
"Multani": range(70272, 70320),
|
255 |
+
"Khudawadi": range(70320, 70400),
|
256 |
+
"Grantha": range(70400, 70528),
|
257 |
+
"Newa": range(70656, 70784),
|
258 |
+
"Tirhuta": range(70784, 70880),
|
259 |
+
"Siddham": range(71040, 71168),
|
260 |
+
"Modi": range(71168, 71264),
|
261 |
+
"Mongolian Supplement": range(71264, 71296),
|
262 |
+
"Takri": range(71296, 71376),
|
263 |
+
"Ahom": range(71424, 71504),
|
264 |
+
"Dogra": range(71680, 71760),
|
265 |
+
"Warang Citi": range(71840, 71936),
|
266 |
+
"Dives Akuru": range(71936, 72032),
|
267 |
+
"Nandinagari": range(72096, 72192),
|
268 |
+
"Zanabazar Square": range(72192, 72272),
|
269 |
+
"Soyombo": range(72272, 72368),
|
270 |
+
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
|
271 |
+
"Pau Cin Hau": range(72384, 72448),
|
272 |
+
"Devanagari Extended-A": range(72448, 72544),
|
273 |
+
"Bhaiksuki": range(72704, 72816),
|
274 |
+
"Marchen": range(72816, 72896),
|
275 |
+
"Masaram Gondi": range(72960, 73056),
|
276 |
+
"Gunjala Gondi": range(73056, 73136),
|
277 |
+
"Makasar": range(73440, 73472),
|
278 |
+
"Kawi": range(73472, 73568),
|
279 |
+
"Lisu Supplement": range(73648, 73664),
|
280 |
+
"Tamil Supplement": range(73664, 73728),
|
281 |
+
"Cuneiform": range(73728, 74752),
|
282 |
+
"Cuneiform Numbers and Punctuation": range(74752, 74880),
|
283 |
+
"Early Dynastic Cuneiform": range(74880, 75088),
|
284 |
+
"Cypro-Minoan": range(77712, 77824),
|
285 |
+
"Egyptian Hieroglyphs": range(77824, 78896),
|
286 |
+
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
|
287 |
+
"Anatolian Hieroglyphs": range(82944, 83584),
|
288 |
+
"Bamum Supplement": range(92160, 92736),
|
289 |
+
"Mro": range(92736, 92784),
|
290 |
+
"Tangsa": range(92784, 92880),
|
291 |
+
"Bassa Vah": range(92880, 92928),
|
292 |
+
"Pahawh Hmong": range(92928, 93072),
|
293 |
+
"Medefaidrin": range(93760, 93856),
|
294 |
+
"Miao": range(93952, 94112),
|
295 |
+
"Ideographic Symbols and Punctuation": range(94176, 94208),
|
296 |
+
"Tangut": range(94208, 100352),
|
297 |
+
"Tangut Components": range(100352, 101120),
|
298 |
+
"Khitan Small Script": range(101120, 101632),
|
299 |
+
"Tangut Supplement": range(101632, 101760),
|
300 |
+
"Kana Extended-B": range(110576, 110592),
|
301 |
+
"Kana Supplement": range(110592, 110848),
|
302 |
+
"Kana Extended-A": range(110848, 110896),
|
303 |
+
"Small Kana Extension": range(110896, 110960),
|
304 |
+
"Nushu": range(110960, 111360),
|
305 |
+
"Duployan": range(113664, 113824),
|
306 |
+
"Shorthand Format Controls": range(113824, 113840),
|
307 |
+
"Znamenny Musical Notation": range(118528, 118736),
|
308 |
+
"Byzantine Musical Symbols": range(118784, 119040),
|
309 |
+
"Musical Symbols": range(119040, 119296),
|
310 |
+
"Ancient Greek Musical Notation": range(119296, 119376),
|
311 |
+
"Kaktovik Numerals": range(119488, 119520),
|
312 |
+
"Mayan Numerals": range(119520, 119552),
|
313 |
+
"Tai Xuan Jing Symbols": range(119552, 119648),
|
314 |
+
"Counting Rod Numerals": range(119648, 119680),
|
315 |
+
"Mathematical Alphanumeric Symbols": range(119808, 120832),
|
316 |
+
"Sutton SignWriting": range(120832, 121520),
|
317 |
+
"Latin Extended-G": range(122624, 122880),
|
318 |
+
"Glagolitic Supplement": range(122880, 122928),
|
319 |
+
"Cyrillic Extended-D": range(122928, 123024),
|
320 |
+
"Nyiakeng Puachue Hmong": range(123136, 123216),
|
321 |
+
"Toto": range(123536, 123584),
|
322 |
+
"Wancho": range(123584, 123648),
|
323 |
+
"Nag Mundari": range(124112, 124160),
|
324 |
+
"Ethiopic Extended-B": range(124896, 124928),
|
325 |
+
"Mende Kikakui": range(124928, 125152),
|
326 |
+
"Adlam": range(125184, 125280),
|
327 |
+
"Indic Siyaq Numbers": range(126064, 126144),
|
328 |
+
"Ottoman Siyaq Numbers": range(126208, 126288),
|
329 |
+
"Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
|
330 |
+
"Mahjong Tiles": range(126976, 127024),
|
331 |
+
"Domino Tiles": range(127024, 127136),
|
332 |
+
"Playing Cards": range(127136, 127232),
|
333 |
+
"Enclosed Alphanumeric Supplement": range(127232, 127488),
|
334 |
+
"Enclosed Ideographic Supplement": range(127488, 127744),
|
335 |
+
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
|
336 |
+
"Emoticons range(Emoji)": range(128512, 128592),
|
337 |
+
"Ornamental Dingbats": range(128592, 128640),
|
338 |
+
"Transport and Map Symbols": range(128640, 128768),
|
339 |
+
"Alchemical Symbols": range(128768, 128896),
|
340 |
+
"Geometric Shapes Extended": range(128896, 129024),
|
341 |
+
"Supplemental Arrows-C": range(129024, 129280),
|
342 |
+
"Supplemental Symbols and Pictographs": range(129280, 129536),
|
343 |
+
"Chess Symbols": range(129536, 129648),
|
344 |
+
"Symbols and Pictographs Extended-A": range(129648, 129792),
|
345 |
+
"Symbols for Legacy Computing": range(129792, 130048),
|
346 |
+
"CJK Unified Ideographs Extension B": range(131072, 173792),
|
347 |
+
"CJK Unified Ideographs Extension C": range(173824, 177984),
|
348 |
+
"CJK Unified Ideographs Extension D": range(177984, 178208),
|
349 |
+
"CJK Unified Ideographs Extension E": range(178208, 183984),
|
350 |
+
"CJK Unified Ideographs Extension F": range(183984, 191472),
|
351 |
+
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
|
352 |
+
"CJK Unified Ideographs Extension G": range(196608, 201552),
|
353 |
+
"CJK Unified Ideographs Extension H": range(201552, 205744),
|
354 |
+
"Tags": range(917504, 917632),
|
355 |
+
"Variation Selectors Supplement": range(917760, 918000),
|
356 |
+
"Supplementary Private Use Area-A": range(983040, 1048576),
|
357 |
+
"Supplementary Private Use Area-B": range(1048576, 1114112),
|
358 |
+
}
|
359 |
+
|
360 |
+
|
361 |
+
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
|
362 |
+
"Supplement",
|
363 |
+
"Extended",
|
364 |
+
"Extensions",
|
365 |
+
"Modifier",
|
366 |
+
"Marks",
|
367 |
+
"Punctuation",
|
368 |
+
"Symbols",
|
369 |
+
"Forms",
|
370 |
+
"Operators",
|
371 |
+
"Miscellaneous",
|
372 |
+
"Drawing",
|
373 |
+
"Block",
|
374 |
+
"Shapes",
|
375 |
+
"Supplemental",
|
376 |
+
"Tags",
|
377 |
+
]
|
378 |
+
|
379 |
+
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
380 |
+
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
381 |
+
IGNORECASE,
|
382 |
+
)
|
383 |
+
|
384 |
+
IANA_NO_ALIASES = [
|
385 |
+
"cp720",
|
386 |
+
"cp737",
|
387 |
+
"cp856",
|
388 |
+
"cp874",
|
389 |
+
"cp875",
|
390 |
+
"cp1006",
|
391 |
+
"koi8_r",
|
392 |
+
"koi8_t",
|
393 |
+
"koi8_u",
|
394 |
+
]
|
395 |
+
|
396 |
+
IANA_SUPPORTED: list[str] = sorted(
|
397 |
+
filter(
|
398 |
+
lambda x: x.endswith("_codec") is False
|
399 |
+
and x not in {"rot_13", "tactis", "mbcs"},
|
400 |
+
list(set(aliases.values())) + IANA_NO_ALIASES,
|
401 |
+
)
|
402 |
+
)
|
403 |
+
|
404 |
+
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
405 |
+
|
406 |
+
# pre-computed code page that are similar using the function cp_similarity.
|
407 |
+
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
|
408 |
+
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
409 |
+
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
410 |
+
"cp1125": ["cp866"],
|
411 |
+
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
|
412 |
+
"cp1250": ["iso8859_2"],
|
413 |
+
"cp1251": ["kz1048", "ptcp154"],
|
414 |
+
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
|
415 |
+
"cp1253": ["iso8859_7"],
|
416 |
+
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
|
417 |
+
"cp1257": ["iso8859_13"],
|
418 |
+
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
|
419 |
+
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
|
420 |
+
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
|
421 |
+
"cp850": ["cp437", "cp857", "cp858", "cp865"],
|
422 |
+
"cp857": ["cp850", "cp858", "cp865"],
|
423 |
+
"cp858": ["cp437", "cp850", "cp857", "cp865"],
|
424 |
+
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
|
425 |
+
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
|
426 |
+
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
|
427 |
+
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
|
428 |
+
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
|
429 |
+
"cp866": ["cp1125"],
|
430 |
+
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
|
431 |
+
"iso8859_11": ["tis_620"],
|
432 |
+
"iso8859_13": ["cp1257"],
|
433 |
+
"iso8859_14": [
|
434 |
+
"iso8859_10",
|
435 |
+
"iso8859_15",
|
436 |
+
"iso8859_16",
|
437 |
+
"iso8859_3",
|
438 |
+
"iso8859_9",
|
439 |
+
"latin_1",
|
440 |
+
],
|
441 |
+
"iso8859_15": [
|
442 |
+
"cp1252",
|
443 |
+
"cp1254",
|
444 |
+
"iso8859_10",
|
445 |
+
"iso8859_14",
|
446 |
+
"iso8859_16",
|
447 |
+
"iso8859_3",
|
448 |
+
"iso8859_9",
|
449 |
+
"latin_1",
|
450 |
+
],
|
451 |
+
"iso8859_16": [
|
452 |
+
"iso8859_14",
|
453 |
+
"iso8859_15",
|
454 |
+
"iso8859_2",
|
455 |
+
"iso8859_3",
|
456 |
+
"iso8859_9",
|
457 |
+
"latin_1",
|
458 |
+
],
|
459 |
+
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
|
460 |
+
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
|
461 |
+
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
|
462 |
+
"iso8859_7": ["cp1253"],
|
463 |
+
"iso8859_9": [
|
464 |
+
"cp1252",
|
465 |
+
"cp1254",
|
466 |
+
"cp1258",
|
467 |
+
"iso8859_10",
|
468 |
+
"iso8859_14",
|
469 |
+
"iso8859_15",
|
470 |
+
"iso8859_16",
|
471 |
+
"iso8859_3",
|
472 |
+
"iso8859_4",
|
473 |
+
"latin_1",
|
474 |
+
],
|
475 |
+
"kz1048": ["cp1251", "ptcp154"],
|
476 |
+
"latin_1": [
|
477 |
+
"cp1252",
|
478 |
+
"cp1254",
|
479 |
+
"cp1258",
|
480 |
+
"iso8859_10",
|
481 |
+
"iso8859_14",
|
482 |
+
"iso8859_15",
|
483 |
+
"iso8859_16",
|
484 |
+
"iso8859_3",
|
485 |
+
"iso8859_4",
|
486 |
+
"iso8859_9",
|
487 |
+
],
|
488 |
+
"mac_iceland": ["mac_roman", "mac_turkish"],
|
489 |
+
"mac_roman": ["mac_iceland", "mac_turkish"],
|
490 |
+
"mac_turkish": ["mac_iceland", "mac_roman"],
|
491 |
+
"ptcp154": ["cp1251", "kz1048"],
|
492 |
+
"tis_620": ["iso8859_11"],
|
493 |
+
}
|
494 |
+
|
495 |
+
|
496 |
+
CHARDET_CORRESPONDENCE: dict[str, str] = {
|
497 |
+
"iso2022_kr": "ISO-2022-KR",
|
498 |
+
"iso2022_jp": "ISO-2022-JP",
|
499 |
+
"euc_kr": "EUC-KR",
|
500 |
+
"tis_620": "TIS-620",
|
501 |
+
"utf_32": "UTF-32",
|
502 |
+
"euc_jp": "EUC-JP",
|
503 |
+
"koi8_r": "KOI8-R",
|
504 |
+
"iso8859_1": "ISO-8859-1",
|
505 |
+
"iso8859_2": "ISO-8859-2",
|
506 |
+
"iso8859_5": "ISO-8859-5",
|
507 |
+
"iso8859_6": "ISO-8859-6",
|
508 |
+
"iso8859_7": "ISO-8859-7",
|
509 |
+
"iso8859_8": "ISO-8859-8",
|
510 |
+
"utf_16": "UTF-16",
|
511 |
+
"cp855": "IBM855",
|
512 |
+
"mac_cyrillic": "MacCyrillic",
|
513 |
+
"gb2312": "GB2312",
|
514 |
+
"gb18030": "GB18030",
|
515 |
+
"cp932": "CP932",
|
516 |
+
"cp866": "IBM866",
|
517 |
+
"utf_8": "utf-8",
|
518 |
+
"utf_8_sig": "UTF-8-SIG",
|
519 |
+
"shift_jis": "SHIFT_JIS",
|
520 |
+
"big5": "Big5",
|
521 |
+
"cp1250": "windows-1250",
|
522 |
+
"cp1251": "windows-1251",
|
523 |
+
"cp1252": "Windows-1252",
|
524 |
+
"cp1253": "windows-1253",
|
525 |
+
"cp1255": "windows-1255",
|
526 |
+
"cp1256": "windows-1256",
|
527 |
+
"cp1254": "Windows-1254",
|
528 |
+
"cp949": "CP949",
|
529 |
+
}
|
530 |
+
|
531 |
+
|
532 |
+
COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
|
533 |
+
"<",
|
534 |
+
">",
|
535 |
+
"=",
|
536 |
+
":",
|
537 |
+
"/",
|
538 |
+
"&",
|
539 |
+
";",
|
540 |
+
"{",
|
541 |
+
"}",
|
542 |
+
"[",
|
543 |
+
"]",
|
544 |
+
",",
|
545 |
+
"|",
|
546 |
+
'"',
|
547 |
+
"-",
|
548 |
+
"(",
|
549 |
+
")",
|
550 |
+
}
|
551 |
+
|
552 |
+
|
553 |
+
KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
|
554 |
+
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
555 |
+
|
556 |
+
# Logging LEVEL below DEBUG
|
557 |
+
TRACE: int = 5
|
558 |
+
|
559 |
+
|
560 |
+
# Language label that contain the em dash "—"
|
561 |
+
# character are to be considered alternative seq to origin
|
562 |
+
FREQUENCIES: dict[str, list[str]] = {
|
563 |
+
"English": [
|
564 |
+
"e",
|
565 |
+
"a",
|
566 |
+
"t",
|
567 |
+
"i",
|
568 |
+
"o",
|
569 |
+
"n",
|
570 |
+
"s",
|
571 |
+
"r",
|
572 |
+
"h",
|
573 |
+
"l",
|
574 |
+
"d",
|
575 |
+
"c",
|
576 |
+
"u",
|
577 |
+
"m",
|
578 |
+
"f",
|
579 |
+
"p",
|
580 |
+
"g",
|
581 |
+
"w",
|
582 |
+
"y",
|
583 |
+
"b",
|
584 |
+
"v",
|
585 |
+
"k",
|
586 |
+
"x",
|
587 |
+
"j",
|
588 |
+
"z",
|
589 |
+
"q",
|
590 |
+
],
|
591 |
+
"English—": [
|
592 |
+
"e",
|
593 |
+
"a",
|
594 |
+
"t",
|
595 |
+
"i",
|
596 |
+
"o",
|
597 |
+
"n",
|
598 |
+
"s",
|
599 |
+
"r",
|
600 |
+
"h",
|
601 |
+
"l",
|
602 |
+
"d",
|
603 |
+
"c",
|
604 |
+
"m",
|
605 |
+
"u",
|
606 |
+
"f",
|
607 |
+
"p",
|
608 |
+
"g",
|
609 |
+
"w",
|
610 |
+
"b",
|
611 |
+
"y",
|
612 |
+
"v",
|
613 |
+
"k",
|
614 |
+
"j",
|
615 |
+
"x",
|
616 |
+
"z",
|
617 |
+
"q",
|
618 |
+
],
|
619 |
+
"German": [
|
620 |
+
"e",
|
621 |
+
"n",
|
622 |
+
"i",
|
623 |
+
"r",
|
624 |
+
"s",
|
625 |
+
"t",
|
626 |
+
"a",
|
627 |
+
"d",
|
628 |
+
"h",
|
629 |
+
"u",
|
630 |
+
"l",
|
631 |
+
"g",
|
632 |
+
"o",
|
633 |
+
"c",
|
634 |
+
"m",
|
635 |
+
"b",
|
636 |
+
"f",
|
637 |
+
"k",
|
638 |
+
"w",
|
639 |
+
"z",
|
640 |
+
"p",
|
641 |
+
"v",
|
642 |
+
"ü",
|
643 |
+
"ä",
|
644 |
+
"ö",
|
645 |
+
"j",
|
646 |
+
],
|
647 |
+
"French": [
|
648 |
+
"e",
|
649 |
+
"a",
|
650 |
+
"s",
|
651 |
+
"n",
|
652 |
+
"i",
|
653 |
+
"t",
|
654 |
+
"r",
|
655 |
+
"l",
|
656 |
+
"u",
|
657 |
+
"o",
|
658 |
+
"d",
|
659 |
+
"c",
|
660 |
+
"p",
|
661 |
+
"m",
|
662 |
+
"é",
|
663 |
+
"v",
|
664 |
+
"g",
|
665 |
+
"f",
|
666 |
+
"b",
|
667 |
+
"h",
|
668 |
+
"q",
|
669 |
+
"à",
|
670 |
+
"x",
|
671 |
+
"è",
|
672 |
+
"y",
|
673 |
+
"j",
|
674 |
+
],
|
675 |
+
"Dutch": [
|
676 |
+
"e",
|
677 |
+
"n",
|
678 |
+
"a",
|
679 |
+
"i",
|
680 |
+
"r",
|
681 |
+
"t",
|
682 |
+
"o",
|
683 |
+
"d",
|
684 |
+
"s",
|
685 |
+
"l",
|
686 |
+
"g",
|
687 |
+
"h",
|
688 |
+
"v",
|
689 |
+
"m",
|
690 |
+
"u",
|
691 |
+
"k",
|
692 |
+
"c",
|
693 |
+
"p",
|
694 |
+
"b",
|
695 |
+
"w",
|
696 |
+
"j",
|
697 |
+
"z",
|
698 |
+
"f",
|
699 |
+
"y",
|
700 |
+
"x",
|
701 |
+
"ë",
|
702 |
+
],
|
703 |
+
"Italian": [
|
704 |
+
"e",
|
705 |
+
"i",
|
706 |
+
"a",
|
707 |
+
"o",
|
708 |
+
"n",
|
709 |
+
"l",
|
710 |
+
"t",
|
711 |
+
"r",
|
712 |
+
"s",
|
713 |
+
"c",
|
714 |
+
"d",
|
715 |
+
"u",
|
716 |
+
"p",
|
717 |
+
"m",
|
718 |
+
"g",
|
719 |
+
"v",
|
720 |
+
"f",
|
721 |
+
"b",
|
722 |
+
"z",
|
723 |
+
"h",
|
724 |
+
"q",
|
725 |
+
"è",
|
726 |
+
"à",
|
727 |
+
"k",
|
728 |
+
"y",
|
729 |
+
"ò",
|
730 |
+
],
|
731 |
+
"Polish": [
|
732 |
+
"a",
|
733 |
+
"i",
|
734 |
+
"o",
|
735 |
+
"e",
|
736 |
+
"n",
|
737 |
+
"r",
|
738 |
+
"z",
|
739 |
+
"w",
|
740 |
+
"s",
|
741 |
+
"c",
|
742 |
+
"t",
|
743 |
+
"k",
|
744 |
+
"y",
|
745 |
+
"d",
|
746 |
+
"p",
|
747 |
+
"m",
|
748 |
+
"u",
|
749 |
+
"l",
|
750 |
+
"j",
|
751 |
+
"ł",
|
752 |
+
"g",
|
753 |
+
"b",
|
754 |
+
"h",
|
755 |
+
"ą",
|
756 |
+
"ę",
|
757 |
+
"ó",
|
758 |
+
],
|
759 |
+
"Spanish": [
|
760 |
+
"e",
|
761 |
+
"a",
|
762 |
+
"o",
|
763 |
+
"n",
|
764 |
+
"s",
|
765 |
+
"r",
|
766 |
+
"i",
|
767 |
+
"l",
|
768 |
+
"d",
|
769 |
+
"t",
|
770 |
+
"c",
|
771 |
+
"u",
|
772 |
+
"m",
|
773 |
+
"p",
|
774 |
+
"b",
|
775 |
+
"g",
|
776 |
+
"v",
|
777 |
+
"f",
|
778 |
+
"y",
|
779 |
+
"ó",
|
780 |
+
"h",
|
781 |
+
"q",
|
782 |
+
"í",
|
783 |
+
"j",
|
784 |
+
"z",
|
785 |
+
"á",
|
786 |
+
],
|
787 |
+
"Russian": [
|
788 |
+
"о",
|
789 |
+
"а",
|
790 |
+
"е",
|
791 |
+
"и",
|
792 |
+
"н",
|
793 |
+
"с",
|
794 |
+
"т",
|
795 |
+
"р",
|
796 |
+
"в",
|
797 |
+
"л",
|
798 |
+
"к",
|
799 |
+
"м",
|
800 |
+
"д",
|
801 |
+
"п",
|
802 |
+
"у",
|
803 |
+
"г",
|
804 |
+
"я",
|
805 |
+
"ы",
|
806 |
+
"з",
|
807 |
+
"б",
|
808 |
+
"й",
|
809 |
+
"ь",
|
810 |
+
"ч",
|
811 |
+
"х",
|
812 |
+
"ж",
|
813 |
+
"ц",
|
814 |
+
],
|
815 |
+
# Jap-Kanji
|
816 |
+
"Japanese": [
|
817 |
+
"人",
|
818 |
+
"一",
|
819 |
+
"大",
|
820 |
+
"亅",
|
821 |
+
"丁",
|
822 |
+
"丨",
|
823 |
+
"竹",
|
824 |
+
"笑",
|
825 |
+
"口",
|
826 |
+
"日",
|
827 |
+
"今",
|
828 |
+
"二",
|
829 |
+
"彳",
|
830 |
+
"行",
|
831 |
+
"十",
|
832 |
+
"土",
|
833 |
+
"丶",
|
834 |
+
"寸",
|
835 |
+
"寺",
|
836 |
+
"時",
|
837 |
+
"乙",
|
838 |
+
"丿",
|
839 |
+
"乂",
|
840 |
+
"气",
|
841 |
+
"気",
|
842 |
+
"冂",
|
843 |
+
"巾",
|
844 |
+
"亠",
|
845 |
+
"市",
|
846 |
+
"目",
|
847 |
+
"儿",
|
848 |
+
"見",
|
849 |
+
"八",
|
850 |
+
"小",
|
851 |
+
"凵",
|
852 |
+
"県",
|
853 |
+
"月",
|
854 |
+
"彐",
|
855 |
+
"門",
|
856 |
+
"間",
|
857 |
+
"木",
|
858 |
+
"東",
|
859 |
+
"山",
|
860 |
+
"出",
|
861 |
+
"本",
|
862 |
+
"中",
|
863 |
+
"刀",
|
864 |
+
"分",
|
865 |
+
"耳",
|
866 |
+
"又",
|
867 |
+
"取",
|
868 |
+
"最",
|
869 |
+
"言",
|
870 |
+
"田",
|
871 |
+
"心",
|
872 |
+
"思",
|
873 |
+
"刂",
|
874 |
+
"前",
|
875 |
+
"京",
|
876 |
+
"尹",
|
877 |
+
"事",
|
878 |
+
"生",
|
879 |
+
"厶",
|
880 |
+
"云",
|
881 |
+
"会",
|
882 |
+
"未",
|
883 |
+
"来",
|
884 |
+
"白",
|
885 |
+
"冫",
|
886 |
+
"楽",
|
887 |
+
"灬",
|
888 |
+
"馬",
|
889 |
+
"尸",
|
890 |
+
"尺",
|
891 |
+
"駅",
|
892 |
+
"明",
|
893 |
+
"耂",
|
894 |
+
"者",
|
895 |
+
"了",
|
896 |
+
"阝",
|
897 |
+
"都",
|
898 |
+
"高",
|
899 |
+
"卜",
|
900 |
+
"占",
|
901 |
+
"厂",
|
902 |
+
"广",
|
903 |
+
"店",
|
904 |
+
"子",
|
905 |
+
"申",
|
906 |
+
"奄",
|
907 |
+
"亻",
|
908 |
+
"俺",
|
909 |
+
"上",
|
910 |
+
"方",
|
911 |
+
"冖",
|
912 |
+
"学",
|
913 |
+
"衣",
|
914 |
+
"艮",
|
915 |
+
"食",
|
916 |
+
"自",
|
917 |
+
],
|
918 |
+
# Jap-Katakana
|
919 |
+
"Japanese—": [
|
920 |
+
"ー",
|
921 |
+
"ン",
|
922 |
+
"ス",
|
923 |
+
"・",
|
924 |
+
"ル",
|
925 |
+
"ト",
|
926 |
+
"リ",
|
927 |
+
"イ",
|
928 |
+
"ア",
|
929 |
+
"ラ",
|
930 |
+
"ッ",
|
931 |
+
"ク",
|
932 |
+
"ド",
|
933 |
+
"シ",
|
934 |
+
"レ",
|
935 |
+
"ジ",
|
936 |
+
"タ",
|
937 |
+
"フ",
|
938 |
+
"ロ",
|
939 |
+
"カ",
|
940 |
+
"テ",
|
941 |
+
"マ",
|
942 |
+
"ィ",
|
943 |
+
"グ",
|
944 |
+
"バ",
|
945 |
+
"ム",
|
946 |
+
"プ",
|
947 |
+
"オ",
|
948 |
+
"コ",
|
949 |
+
"デ",
|
950 |
+
"ニ",
|
951 |
+
"ウ",
|
952 |
+
"メ",
|
953 |
+
"サ",
|
954 |
+
"ビ",
|
955 |
+
"ナ",
|
956 |
+
"ブ",
|
957 |
+
"ャ",
|
958 |
+
"エ",
|
959 |
+
"ュ",
|
960 |
+
"チ",
|
961 |
+
"キ",
|
962 |
+
"ズ",
|
963 |
+
"ダ",
|
964 |
+
"パ",
|
965 |
+
"ミ",
|
966 |
+
"ェ",
|
967 |
+
"ョ",
|
968 |
+
"ハ",
|
969 |
+
"セ",
|
970 |
+
"ベ",
|
971 |
+
"ガ",
|
972 |
+
"モ",
|
973 |
+
"ツ",
|
974 |
+
"ネ",
|
975 |
+
"ボ",
|
976 |
+
"ソ",
|
977 |
+
"ノ",
|
978 |
+
"ァ",
|
979 |
+
"ヴ",
|
980 |
+
"ワ",
|
981 |
+
"ポ",
|
982 |
+
"ペ",
|
983 |
+
"ピ",
|
984 |
+
"ケ",
|
985 |
+
"ゴ",
|
986 |
+
"ギ",
|
987 |
+
"ザ",
|
988 |
+
"ホ",
|
989 |
+
"ゲ",
|
990 |
+
"ォ",
|
991 |
+
"ヤ",
|
992 |
+
"ヒ",
|
993 |
+
"ユ",
|
994 |
+
"ヨ",
|
995 |
+
"ヘ",
|
996 |
+
"ゼ",
|
997 |
+
"ヌ",
|
998 |
+
"ゥ",
|
999 |
+
"ゾ",
|
1000 |
+
"ヶ",
|
1001 |
+
"ヂ",
|
1002 |
+
"ヲ",
|
1003 |
+
"ヅ",
|
1004 |
+
"ヵ",
|
1005 |
+
"ヱ",
|
1006 |
+
"ヰ",
|
1007 |
+
"ヮ",
|
1008 |
+
"ヽ",
|
1009 |
+
"゠",
|
1010 |
+
"ヾ",
|
1011 |
+
"ヷ",
|
1012 |
+
"ヿ",
|
1013 |
+
"ヸ",
|
1014 |
+
"ヹ",
|
1015 |
+
"ヺ",
|
1016 |
+
],
|
1017 |
+
# Jap-Hiragana
|
1018 |
+
"Japanese——": [
|
1019 |
+
"の",
|
1020 |
+
"に",
|
1021 |
+
"る",
|
1022 |
+
"た",
|
1023 |
+
"と",
|
1024 |
+
"は",
|
1025 |
+
"し",
|
1026 |
+
"い",
|
1027 |
+
"を",
|
1028 |
+
"で",
|
1029 |
+
"て",
|
1030 |
+
"が",
|
1031 |
+
"な",
|
1032 |
+
"れ",
|
1033 |
+
"か",
|
1034 |
+
"ら",
|
1035 |
+
"さ",
|
1036 |
+
"っ",
|
1037 |
+
"り",
|
1038 |
+
"す",
|
1039 |
+
"あ",
|
1040 |
+
"も",
|
1041 |
+
"こ",
|
1042 |
+
"ま",
|
1043 |
+
"う",
|
1044 |
+
"く",
|
1045 |
+
"よ",
|
1046 |
+
"き",
|
1047 |
+
"ん",
|
1048 |
+
"め",
|
1049 |
+
"お",
|
1050 |
+
"け",
|
1051 |
+
"そ",
|
1052 |
+
"つ",
|
1053 |
+
"だ",
|
1054 |
+
"や",
|
1055 |
+
"え",
|
1056 |
+
"ど",
|
1057 |
+
"わ",
|
1058 |
+
"ち",
|
1059 |
+
"み",
|
1060 |
+
"せ",
|
1061 |
+
"じ",
|
1062 |
+
"ば",
|
1063 |
+
"へ",
|
1064 |
+
"び",
|
1065 |
+
"ず",
|
1066 |
+
"ろ",
|
1067 |
+
"ほ",
|
1068 |
+
"げ",
|
1069 |
+
"む",
|
1070 |
+
"べ",
|
1071 |
+
"ひ",
|
1072 |
+
"ょ",
|
1073 |
+
"ゆ",
|
1074 |
+
"ぶ",
|
1075 |
+
"ご",
|
1076 |
+
"ゃ",
|
1077 |
+
"ね",
|
1078 |
+
"ふ",
|
1079 |
+
"ぐ",
|
1080 |
+
"ぎ",
|
1081 |
+
"ぼ",
|
1082 |
+
"ゅ",
|
1083 |
+
"づ",
|
1084 |
+
"ざ",
|
1085 |
+
"ぞ",
|
1086 |
+
"ぬ",
|
1087 |
+
"ぜ",
|
1088 |
+
"ぱ",
|
1089 |
+
"ぽ",
|
1090 |
+
"ぷ",
|
1091 |
+
"ぴ",
|
1092 |
+
"ぃ",
|
1093 |
+
"ぁ",
|
1094 |
+
"ぇ",
|
1095 |
+
"ぺ",
|
1096 |
+
"ゞ",
|
1097 |
+
"ぢ",
|
1098 |
+
"ぉ",
|
1099 |
+
"ぅ",
|
1100 |
+
"ゐ",
|
1101 |
+
"ゝ",
|
1102 |
+
"ゑ",
|
1103 |
+
"゛",
|
1104 |
+
"゜",
|
1105 |
+
"ゎ",
|
1106 |
+
"ゔ",
|
1107 |
+
"゚",
|
1108 |
+
"ゟ",
|
1109 |
+
"゙",
|
1110 |
+
"ゕ",
|
1111 |
+
"ゖ",
|
1112 |
+
],
|
1113 |
+
"Portuguese": [
|
1114 |
+
"a",
|
1115 |
+
"e",
|
1116 |
+
"o",
|
1117 |
+
"s",
|
1118 |
+
"i",
|
1119 |
+
"r",
|
1120 |
+
"d",
|
1121 |
+
"n",
|
1122 |
+
"t",
|
1123 |
+
"m",
|
1124 |
+
"u",
|
1125 |
+
"c",
|
1126 |
+
"l",
|
1127 |
+
"p",
|
1128 |
+
"g",
|
1129 |
+
"v",
|
1130 |
+
"b",
|
1131 |
+
"f",
|
1132 |
+
"h",
|
1133 |
+
"ã",
|
1134 |
+
"q",
|
1135 |
+
"é",
|
1136 |
+
"ç",
|
1137 |
+
"á",
|
1138 |
+
"z",
|
1139 |
+
"í",
|
1140 |
+
],
|
1141 |
+
"Swedish": [
|
1142 |
+
"e",
|
1143 |
+
"a",
|
1144 |
+
"n",
|
1145 |
+
"r",
|
1146 |
+
"t",
|
1147 |
+
"s",
|
1148 |
+
"i",
|
1149 |
+
"l",
|
1150 |
+
"d",
|
1151 |
+
"o",
|
1152 |
+
"m",
|
1153 |
+
"k",
|
1154 |
+
"g",
|
1155 |
+
"v",
|
1156 |
+
"h",
|
1157 |
+
"f",
|
1158 |
+
"u",
|
1159 |
+
"p",
|
1160 |
+
"ä",
|
1161 |
+
"c",
|
1162 |
+
"b",
|
1163 |
+
"ö",
|
1164 |
+
"å",
|
1165 |
+
"y",
|
1166 |
+
"j",
|
1167 |
+
"x",
|
1168 |
+
],
|
1169 |
+
"Chinese": [
|
1170 |
+
"的",
|
1171 |
+
"一",
|
1172 |
+
"是",
|
1173 |
+
"不",
|
1174 |
+
"了",
|
1175 |
+
"在",
|
1176 |
+
"人",
|
1177 |
+
"有",
|
1178 |
+
"我",
|
1179 |
+
"他",
|
1180 |
+
"这",
|
1181 |
+
"个",
|
1182 |
+
"们",
|
1183 |
+
"中",
|
1184 |
+
"来",
|
1185 |
+
"上",
|
1186 |
+
"大",
|
1187 |
+
"为",
|
1188 |
+
"和",
|
1189 |
+
"国",
|
1190 |
+
"地",
|
1191 |
+
"到",
|
1192 |
+
"以",
|
1193 |
+
"说",
|
1194 |
+
"时",
|
1195 |
+
"要",
|
1196 |
+
"就",
|
1197 |
+
"出",
|
1198 |
+
"会",
|
1199 |
+
"可",
|
1200 |
+
"也",
|
1201 |
+
"你",
|
1202 |
+
"对",
|
1203 |
+
"生",
|
1204 |
+
"能",
|
1205 |
+
"而",
|
1206 |
+
"子",
|
1207 |
+
"那",
|
1208 |
+
"得",
|
1209 |
+
"于",
|
1210 |
+
"着",
|
1211 |
+
"下",
|
1212 |
+
"自",
|
1213 |
+
"之",
|
1214 |
+
"年",
|
1215 |
+
"过",
|
1216 |
+
"发",
|
1217 |
+
"后",
|
1218 |
+
"作",
|
1219 |
+
"里",
|
1220 |
+
"用",
|
1221 |
+
"道",
|
1222 |
+
"行",
|
1223 |
+
"所",
|
1224 |
+
"然",
|
1225 |
+
"家",
|
1226 |
+
"种",
|
1227 |
+
"事",
|
1228 |
+
"成",
|
1229 |
+
"方",
|
1230 |
+
"多",
|
1231 |
+
"经",
|
1232 |
+
"么",
|
1233 |
+
"去",
|
1234 |
+
"法",
|
1235 |
+
"学",
|
1236 |
+
"如",
|
1237 |
+
"都",
|
1238 |
+
"同",
|
1239 |
+
"现",
|
1240 |
+
"当",
|
1241 |
+
"没",
|
1242 |
+
"动",
|
1243 |
+
"面",
|
1244 |
+
"起",
|
1245 |
+
"看",
|
1246 |
+
"定",
|
1247 |
+
"天",
|
1248 |
+
"分",
|
1249 |
+
"还",
|
1250 |
+
"进",
|
1251 |
+
"好",
|
1252 |
+
"小",
|
1253 |
+
"部",
|
1254 |
+
"其",
|
1255 |
+
"些",
|
1256 |
+
"主",
|
1257 |
+
"样",
|
1258 |
+
"理",
|
1259 |
+
"心",
|
1260 |
+
"她",
|
1261 |
+
"本",
|
1262 |
+
"前",
|
1263 |
+
"开",
|
1264 |
+
"但",
|
1265 |
+
"因",
|
1266 |
+
"只",
|
1267 |
+
"从",
|
1268 |
+
"想",
|
1269 |
+
"实",
|
1270 |
+
],
|
1271 |
+
"Ukrainian": [
|
1272 |
+
"о",
|
1273 |
+
"а",
|
1274 |
+
"н",
|
1275 |
+
"і",
|
1276 |
+
"и",
|
1277 |
+
"р",
|
1278 |
+
"в",
|
1279 |
+
"т",
|
1280 |
+
"е",
|
1281 |
+
"с",
|
1282 |
+
"к",
|
1283 |
+
"л",
|
1284 |
+
"у",
|
1285 |
+
"д",
|
1286 |
+
"м",
|
1287 |
+
"п",
|
1288 |
+
"з",
|
1289 |
+
"я",
|
1290 |
+
"ь",
|
1291 |
+
"б",
|
1292 |
+
"г",
|
1293 |
+
"й",
|
1294 |
+
"ч",
|
1295 |
+
"х",
|
1296 |
+
"ц",
|
1297 |
+
"ї",
|
1298 |
+
],
|
1299 |
+
"Norwegian": [
|
1300 |
+
"e",
|
1301 |
+
"r",
|
1302 |
+
"n",
|
1303 |
+
"t",
|
1304 |
+
"a",
|
1305 |
+
"s",
|
1306 |
+
"i",
|
1307 |
+
"o",
|
1308 |
+
"l",
|
1309 |
+
"d",
|
1310 |
+
"g",
|
1311 |
+
"k",
|
1312 |
+
"m",
|
1313 |
+
"v",
|
1314 |
+
"f",
|
1315 |
+
"p",
|
1316 |
+
"u",
|
1317 |
+
"b",
|
1318 |
+
"h",
|
1319 |
+
"å",
|
1320 |
+
"y",
|
1321 |
+
"j",
|
1322 |
+
"ø",
|
1323 |
+
"c",
|
1324 |
+
"æ",
|
1325 |
+
"w",
|
1326 |
+
],
|
1327 |
+
"Finnish": [
|
1328 |
+
"a",
|
1329 |
+
"i",
|
1330 |
+
"n",
|
1331 |
+
"t",
|
1332 |
+
"e",
|
1333 |
+
"s",
|
1334 |
+
"l",
|
1335 |
+
"o",
|
1336 |
+
"u",
|
1337 |
+
"k",
|
1338 |
+
"ä",
|
1339 |
+
"m",
|
1340 |
+
"r",
|
1341 |
+
"v",
|
1342 |
+
"j",
|
1343 |
+
"h",
|
1344 |
+
"p",
|
1345 |
+
"y",
|
1346 |
+
"d",
|
1347 |
+
"ö",
|
1348 |
+
"g",
|
1349 |
+
"c",
|
1350 |
+
"b",
|
1351 |
+
"f",
|
1352 |
+
"w",
|
1353 |
+
"z",
|
1354 |
+
],
|
1355 |
+
"Vietnamese": [
|
1356 |
+
"n",
|
1357 |
+
"h",
|
1358 |
+
"t",
|
1359 |
+
"i",
|
1360 |
+
"c",
|
1361 |
+
"g",
|
1362 |
+
"a",
|
1363 |
+
"o",
|
1364 |
+
"u",
|
1365 |
+
"m",
|
1366 |
+
"l",
|
1367 |
+
"r",
|
1368 |
+
"à",
|
1369 |
+
"đ",
|
1370 |
+
"s",
|
1371 |
+
"e",
|
1372 |
+
"v",
|
1373 |
+
"p",
|
1374 |
+
"b",
|
1375 |
+
"y",
|
1376 |
+
"ư",
|
1377 |
+
"d",
|
1378 |
+
"á",
|
1379 |
+
"k",
|
1380 |
+
"ộ",
|
1381 |
+
"ế",
|
1382 |
+
],
|
1383 |
+
"Czech": [
|
1384 |
+
"o",
|
1385 |
+
"e",
|
1386 |
+
"a",
|
1387 |
+
"n",
|
1388 |
+
"t",
|
1389 |
+
"s",
|
1390 |
+
"i",
|
1391 |
+
"l",
|
1392 |
+
"v",
|
1393 |
+
"r",
|
1394 |
+
"k",
|
1395 |
+
"d",
|
1396 |
+
"u",
|
1397 |
+
"m",
|
1398 |
+
"p",
|
1399 |
+
"í",
|
1400 |
+
"c",
|
1401 |
+
"h",
|
1402 |
+
"z",
|
1403 |
+
"á",
|
        "y", "j", "b", "ě", "é", "ř",
    ],
    "Hungarian": [
        "e", "a", "t", "l", "s", "n", "k", "r", "i", "o", "z", "á", "é",
        "g", "m", "b", "y", "v", "d", "h", "u", "p", "j", "ö", "f", "c",
    ],
    "Korean": [
        "이", "다", "에", "의", "는", "로", "하", "을", "가", "고", "지", "서", "한",
        "은", "기", "으", "년", "대", "사", "시", "를", "리", "도", "인", "스", "일",
    ],
    "Indonesian": [
        "a", "n", "e", "i", "r", "t", "u", "s", "d", "k", "m", "l", "g",
        "p", "b", "o", "h", "y", "j", "c", "w", "f", "v", "z", "x", "q",
    ],
    "Turkish": [
        "a", "e", "i", "n", "r", "l", "ı", "k", "d", "t", "s", "m", "y",
        "u", "o", "b", "ü", "ş", "v", "g", "z", "h", "c", "p", "ç", "ğ",
    ],
    "Romanian": [
        "e", "i", "a", "r", "n", "t", "u", "l", "o", "c", "s", "d", "p",
        "m", "ă", "f", "v", "î", "g", "b", "ș", "ț", "z", "h", "â", "j",
    ],
    "Farsi": [
        "ا", "ی", "ر", "د", "ن", "ه", "و", "م", "ت", "ب", "س", "ل", "ک",
        "ش", "ز", "ف", "گ", "ع", "خ", "ق", "ج", "آ", "پ", "ح", "ط", "ص",
    ],
    "Arabic": [
        "ا", "ل", "ي", "م", "و", "ن", "ر", "ت", "ب", "ة", "ع", "د", "س",
        "ف", "ه", "ك", "ق", "أ", "ح", "ج", "ش", "ط", "ص", "ى", "خ", "إ",
    ],
    "Danish": [
        "e", "r", "n", "t", "a", "i", "s", "d", "l", "o", "g", "m", "k",
        "f", "v", "u", "b", "h", "p", "å", "y", "ø", "æ", "c", "j", "w",
    ],
    "Serbian": [
        "а", "и", "о", "е", "н", "р", "с", "у", "т", "к", "ј", "в", "д",
        "м", "п", "л", "г", "з", "б", "a", "i", "e", "o", "n", "ц", "ш",
    ],
    "Lithuanian": [
        "i", "a", "s", "o", "r", "e", "t", "n", "u", "k", "m", "l", "p",
        "v", "d", "j", "g", "ė", "b", "y", "ų", "š", "ž", "c", "ą", "į",
    ],
    "Slovene": [
        "e", "a", "i", "o", "n", "r", "s", "l", "t", "j", "v", "k", "d",
        "p", "m", "u", "z", "b", "g", "h", "č", "c", "š", "ž", "f", "y",
    ],
    "Slovak": [
        "o", "a", "e", "n", "i", "r", "v", "t", "s", "l", "k", "d", "m",
        "p", "u", "c", "h", "j", "b", "z", "á", "y", "ý", "í", "č", "é",
    ],
    "Hebrew": [
        "י", "ו", "ה", "ל", "ר", "ב", "ת", "מ", "א", "ש", "נ", "ע", "ם",
        "ד", "ק", "ח", "פ", "ס", "כ", "ג", "ט", "צ", "ן", "ז", "ך",
    ],
    "Bulgarian": [
        "а", "и", "о", "е", "н", "т", "р", "с", "в", "л", "к", "д", "п",
        "м", "з", "г", "я", "ъ", "у", "б", "ч", "ц", "й", "ж", "щ", "х",
    ],
    "Croatian": [
        "a", "i", "o", "e", "n", "r", "j", "s", "t", "u", "k", "l", "v",
        "d", "m", "p", "g", "z", "b", "c", "č", "h", "š", "ž", "ć", "f",
    ],
    "Hindi": [
        "क", "र", "स", "न", "त", "म", "ह", "प", "य", "ल", "व", "ज", "द",
        "ग", "ब", "श", "ट", "अ", "ए", "थ", "भ", "ड", "च", "ध", "ष", "इ",
    ],
    "Estonian": [
        "a", "i", "e", "s", "t", "l", "u", "n", "o", "k", "r", "d", "m",
        "v", "g", "p", "j", "h", "ä", "b", "õ", "ü", "f", "c", "ö", "y",
    ],
    "Thai": [
        "า", "น", "ร", "อ", "ก", "เ", "ง", "ม", "ย", "ล", "ว", "ด", "ท",
        "ส", "ต", "ะ", "ป", "บ", "ค", "ห", "แ", "จ", "พ", "ช", "ข", "ใ",
    ],
    "Greek": [
        "α", "τ", "ο", "ι", "ε", "ν", "ρ", "σ", "κ", "η", "π", "ς", "υ",
        "μ", "λ", "ί", "ό", "ά", "γ", "έ", "δ", "ή", "ω", "χ", "θ", "ύ",
    ],
    "Tamil": [
        "க", "த", "ப", "ட", "ர", "ம", "ல", "ன", "வ", "ற", "ய", "ள",
        "ச", "ந", "இ", "ண", "அ", "ஆ", "ழ", "ங", "எ", "உ", "ஒ", "ஸ",
    ],
    "Kazakh": [
        "а", "ы", "е", "н", "т", "р", "л", "і", "д", "с", "м", "қ", "к",
        "о", "б", "и", "у", "ғ", "ж", "ң", "з", "ш", "й", "п", "г", "ө",
    ],
}

LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
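The FREQUENCIES table above is the data the package's coherence detection consults. As a rough illustration only (this is not code from the diff, and the library's real scoring in cd.py differs in detail), a table of this shape can be used to score how strongly a decoded string leans toward a given language:

# Illustrative sketch, not part of the upload above. FREQUENCIES_SAMPLE and
# language_affinity() are hypothetical names introduced for this example.
from collections import Counter

FREQUENCIES_SAMPLE = {
    "Turkish": ["a", "e", "i", "n", "r", "l", "ı", "k", "d", "t"],
    "Danish": ["e", "r", "n", "t", "a", "i", "s", "d", "l", "o"],
}

def language_affinity(text: str, popular_chars: list[str]) -> float:
    """Share of alphabetic characters that belong to the language's popular set."""
    letters = [c for c in text.lower() if c.isalpha()]
    if not letters:
        return 0.0
    counts = Counter(letters)
    hits = sum(n for ch, n in counts.items() if ch in popular_chars)
    return hits / len(letters)

if __name__ == "__main__":
    sample = "Åbningstiderne er de samme i dag"
    for lang, chars in FREQUENCIES_SAMPLE.items():
        print(lang, round(language_affinity(sample, chars), 3))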
env/Lib/site-packages/charset_normalizer/legacy.py
ADDED
@@ -0,0 +1,66 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE

# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        encoding: str | None
        language: str
        confidence: float | None


def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
    This function is deprecated and should be used to migrate your project easily, consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    """
    if len(kwargs):
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: " "{}".format(
                type(byte_str)
            )
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
    # but chardet does return 'utf-8-sig' and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
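For context, a minimal usage sketch of the detect() compatibility helper added above, assuming the vendored charset_normalizer package in this Space's env/ folder is importable:

# Usage sketch only; the input bytes and expected output are illustrative.
from charset_normalizer.legacy import detect

payload = "Comment ça va aujourd'hui ?".encode("cp1252")
result = detect(payload)

# result is a dict with the chardet-style keys shown in the diff:
# 'encoding', 'language' and 'confidence'.
print(result["encoding"], result["language"], result["confidence"])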
env/Lib/site-packages/charset_normalizer/md.py
ADDED
@@ -0,0 +1,630 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from functools import lru_cache
|
4 |
+
from logging import getLogger
|
5 |
+
|
6 |
+
from .constant import (
|
7 |
+
COMMON_SAFE_ASCII_CHARACTERS,
|
8 |
+
TRACE,
|
9 |
+
UNICODE_SECONDARY_RANGE_KEYWORD,
|
10 |
+
)
|
11 |
+
from .utils import (
|
12 |
+
is_accentuated,
|
13 |
+
is_arabic,
|
14 |
+
is_arabic_isolated_form,
|
15 |
+
is_case_variable,
|
16 |
+
is_cjk,
|
17 |
+
is_emoticon,
|
18 |
+
is_hangul,
|
19 |
+
is_hiragana,
|
20 |
+
is_katakana,
|
21 |
+
is_latin,
|
22 |
+
is_punctuation,
|
23 |
+
is_separator,
|
24 |
+
is_symbol,
|
25 |
+
is_thai,
|
26 |
+
is_unprintable,
|
27 |
+
remove_accent,
|
28 |
+
unicode_range,
|
29 |
+
)
|
30 |
+
|
31 |
+
|
32 |
+
class MessDetectorPlugin:
|
33 |
+
"""
|
34 |
+
Base abstract class used for mess detection plugins.
|
35 |
+
All detectors MUST extend and implement given methods.
|
36 |
+
"""
|
37 |
+
|
38 |
+
def eligible(self, character: str) -> bool:
|
39 |
+
"""
|
40 |
+
Determine if given character should be fed in.
|
41 |
+
"""
|
42 |
+
raise NotImplementedError # pragma: nocover
|
43 |
+
|
44 |
+
def feed(self, character: str) -> None:
|
45 |
+
"""
|
46 |
+
The main routine to be executed upon character.
|
47 |
+
Insert the logic in witch the text would be considered chaotic.
|
48 |
+
"""
|
49 |
+
raise NotImplementedError # pragma: nocover
|
50 |
+
|
51 |
+
def reset(self) -> None: # pragma: no cover
|
52 |
+
"""
|
53 |
+
Permit to reset the plugin to the initial state.
|
54 |
+
"""
|
55 |
+
raise NotImplementedError
|
56 |
+
|
57 |
+
@property
|
58 |
+
def ratio(self) -> float:
|
59 |
+
"""
|
60 |
+
Compute the chaos ratio based on what your feed() has seen.
|
61 |
+
Must NOT be lower than 0.; No restriction gt 0.
|
62 |
+
"""
|
63 |
+
raise NotImplementedError # pragma: nocover
|
64 |
+
|
65 |
+
|
66 |
+
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
67 |
+
def __init__(self) -> None:
|
68 |
+
self._punctuation_count: int = 0
|
69 |
+
self._symbol_count: int = 0
|
70 |
+
self._character_count: int = 0
|
71 |
+
|
72 |
+
self._last_printable_char: str | None = None
|
73 |
+
self._frenzy_symbol_in_word: bool = False
|
74 |
+
|
75 |
+
def eligible(self, character: str) -> bool:
|
76 |
+
return character.isprintable()
|
77 |
+
|
78 |
+
def feed(self, character: str) -> None:
|
79 |
+
self._character_count += 1
|
80 |
+
|
81 |
+
if (
|
82 |
+
character != self._last_printable_char
|
83 |
+
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
84 |
+
):
|
85 |
+
if is_punctuation(character):
|
86 |
+
self._punctuation_count += 1
|
87 |
+
elif (
|
88 |
+
character.isdigit() is False
|
89 |
+
and is_symbol(character)
|
90 |
+
and is_emoticon(character) is False
|
91 |
+
):
|
92 |
+
self._symbol_count += 2
|
93 |
+
|
94 |
+
self._last_printable_char = character
|
95 |
+
|
96 |
+
def reset(self) -> None: # Abstract
|
97 |
+
self._punctuation_count = 0
|
98 |
+
self._character_count = 0
|
99 |
+
self._symbol_count = 0
|
100 |
+
|
101 |
+
@property
|
102 |
+
def ratio(self) -> float:
|
103 |
+
if self._character_count == 0:
|
104 |
+
return 0.0
|
105 |
+
|
106 |
+
ratio_of_punctuation: float = (
|
107 |
+
self._punctuation_count + self._symbol_count
|
108 |
+
) / self._character_count
|
109 |
+
|
110 |
+
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
111 |
+
|
112 |
+
|
113 |
+
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
114 |
+
def __init__(self) -> None:
|
115 |
+
self._character_count: int = 0
|
116 |
+
self._accentuated_count: int = 0
|
117 |
+
|
118 |
+
def eligible(self, character: str) -> bool:
|
119 |
+
return character.isalpha()
|
120 |
+
|
121 |
+
def feed(self, character: str) -> None:
|
122 |
+
self._character_count += 1
|
123 |
+
|
124 |
+
if is_accentuated(character):
|
125 |
+
self._accentuated_count += 1
|
126 |
+
|
127 |
+
def reset(self) -> None: # Abstract
|
128 |
+
self._character_count = 0
|
129 |
+
self._accentuated_count = 0
|
130 |
+
|
131 |
+
@property
|
132 |
+
def ratio(self) -> float:
|
133 |
+
if self._character_count < 8:
|
134 |
+
return 0.0
|
135 |
+
|
136 |
+
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
137 |
+
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
138 |
+
|
139 |
+
|
140 |
+
class UnprintablePlugin(MessDetectorPlugin):
|
141 |
+
def __init__(self) -> None:
|
142 |
+
self._unprintable_count: int = 0
|
143 |
+
self._character_count: int = 0
|
144 |
+
|
145 |
+
def eligible(self, character: str) -> bool:
|
146 |
+
return True
|
147 |
+
|
148 |
+
def feed(self, character: str) -> None:
|
149 |
+
if is_unprintable(character):
|
150 |
+
self._unprintable_count += 1
|
151 |
+
self._character_count += 1
|
152 |
+
|
153 |
+
def reset(self) -> None: # Abstract
|
154 |
+
self._unprintable_count = 0
|
155 |
+
|
156 |
+
@property
|
157 |
+
def ratio(self) -> float:
|
158 |
+
if self._character_count == 0:
|
159 |
+
return 0.0
|
160 |
+
|
161 |
+
return (self._unprintable_count * 8) / self._character_count
|
162 |
+
|
163 |
+
|
164 |
+
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
165 |
+
def __init__(self) -> None:
|
166 |
+
self._successive_count: int = 0
|
167 |
+
self._character_count: int = 0
|
168 |
+
|
169 |
+
self._last_latin_character: str | None = None
|
170 |
+
|
171 |
+
def eligible(self, character: str) -> bool:
|
172 |
+
return character.isalpha() and is_latin(character)
|
173 |
+
|
174 |
+
def feed(self, character: str) -> None:
|
175 |
+
self._character_count += 1
|
176 |
+
if (
|
177 |
+
self._last_latin_character is not None
|
178 |
+
and is_accentuated(character)
|
179 |
+
and is_accentuated(self._last_latin_character)
|
180 |
+
):
|
181 |
+
if character.isupper() and self._last_latin_character.isupper():
|
182 |
+
self._successive_count += 1
|
183 |
+
# Worse if its the same char duplicated with different accent.
|
184 |
+
if remove_accent(character) == remove_accent(self._last_latin_character):
|
185 |
+
self._successive_count += 1
|
186 |
+
self._last_latin_character = character
|
187 |
+
|
188 |
+
def reset(self) -> None: # Abstract
|
189 |
+
self._successive_count = 0
|
190 |
+
self._character_count = 0
|
191 |
+
self._last_latin_character = None
|
192 |
+
|
193 |
+
@property
|
194 |
+
def ratio(self) -> float:
|
195 |
+
if self._character_count == 0:
|
196 |
+
return 0.0
|
197 |
+
|
198 |
+
return (self._successive_count * 2) / self._character_count
|
199 |
+
|
200 |
+
|
201 |
+
class SuspiciousRange(MessDetectorPlugin):
|
202 |
+
def __init__(self) -> None:
|
203 |
+
self._suspicious_successive_range_count: int = 0
|
204 |
+
self._character_count: int = 0
|
205 |
+
self._last_printable_seen: str | None = None
|
206 |
+
|
207 |
+
def eligible(self, character: str) -> bool:
|
208 |
+
return character.isprintable()
|
209 |
+
|
210 |
+
def feed(self, character: str) -> None:
|
211 |
+
self._character_count += 1
|
212 |
+
|
213 |
+
if (
|
214 |
+
character.isspace()
|
215 |
+
or is_punctuation(character)
|
216 |
+
or character in COMMON_SAFE_ASCII_CHARACTERS
|
217 |
+
):
|
218 |
+
self._last_printable_seen = None
|
219 |
+
return
|
220 |
+
|
221 |
+
if self._last_printable_seen is None:
|
222 |
+
self._last_printable_seen = character
|
223 |
+
return
|
224 |
+
|
225 |
+
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
|
226 |
+
unicode_range_b: str | None = unicode_range(character)
|
227 |
+
|
228 |
+
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
229 |
+
self._suspicious_successive_range_count += 1
|
230 |
+
|
231 |
+
self._last_printable_seen = character
|
232 |
+
|
233 |
+
def reset(self) -> None: # Abstract
|
234 |
+
self._character_count = 0
|
235 |
+
self._suspicious_successive_range_count = 0
|
236 |
+
self._last_printable_seen = None
|
237 |
+
|
238 |
+
@property
|
239 |
+
def ratio(self) -> float:
|
240 |
+
if self._character_count <= 13:
|
241 |
+
return 0.0
|
242 |
+
|
243 |
+
ratio_of_suspicious_range_usage: float = (
|
244 |
+
self._suspicious_successive_range_count * 2
|
245 |
+
) / self._character_count
|
246 |
+
|
247 |
+
return ratio_of_suspicious_range_usage
|
248 |
+
|
249 |
+
|
250 |
+
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
251 |
+
def __init__(self) -> None:
|
252 |
+
self._word_count: int = 0
|
253 |
+
self._bad_word_count: int = 0
|
254 |
+
self._foreign_long_count: int = 0
|
255 |
+
|
256 |
+
self._is_current_word_bad: bool = False
|
257 |
+
self._foreign_long_watch: bool = False
|
258 |
+
|
259 |
+
self._character_count: int = 0
|
260 |
+
self._bad_character_count: int = 0
|
261 |
+
|
262 |
+
self._buffer: str = ""
|
263 |
+
self._buffer_accent_count: int = 0
|
264 |
+
self._buffer_glyph_count: int = 0
|
265 |
+
|
266 |
+
def eligible(self, character: str) -> bool:
|
267 |
+
return True
|
268 |
+
|
269 |
+
def feed(self, character: str) -> None:
|
270 |
+
if character.isalpha():
|
271 |
+
self._buffer += character
|
272 |
+
if is_accentuated(character):
|
273 |
+
self._buffer_accent_count += 1
|
274 |
+
if (
|
275 |
+
self._foreign_long_watch is False
|
276 |
+
and (is_latin(character) is False or is_accentuated(character))
|
277 |
+
and is_cjk(character) is False
|
278 |
+
and is_hangul(character) is False
|
279 |
+
and is_katakana(character) is False
|
280 |
+
and is_hiragana(character) is False
|
281 |
+
and is_thai(character) is False
|
282 |
+
):
|
283 |
+
self._foreign_long_watch = True
|
284 |
+
if (
|
285 |
+
is_cjk(character)
|
286 |
+
or is_hangul(character)
|
287 |
+
or is_katakana(character)
|
288 |
+
or is_hiragana(character)
|
289 |
+
or is_thai(character)
|
290 |
+
):
|
291 |
+
self._buffer_glyph_count += 1
|
292 |
+
return
|
293 |
+
if not self._buffer:
|
294 |
+
return
|
295 |
+
if (
|
296 |
+
character.isspace() or is_punctuation(character) or is_separator(character)
|
297 |
+
) and self._buffer:
|
298 |
+
self._word_count += 1
|
299 |
+
buffer_length: int = len(self._buffer)
|
300 |
+
|
301 |
+
self._character_count += buffer_length
|
302 |
+
|
303 |
+
if buffer_length >= 4:
|
304 |
+
if self._buffer_accent_count / buffer_length >= 0.5:
|
305 |
+
self._is_current_word_bad = True
|
306 |
+
# Word/Buffer ending with an upper case accentuated letter are so rare,
|
307 |
+
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
|
308 |
+
elif (
|
309 |
+
is_accentuated(self._buffer[-1])
|
310 |
+
and self._buffer[-1].isupper()
|
311 |
+
and all(_.isupper() for _ in self._buffer) is False
|
312 |
+
):
|
313 |
+
self._foreign_long_count += 1
|
314 |
+
self._is_current_word_bad = True
|
315 |
+
elif self._buffer_glyph_count == 1:
|
316 |
+
self._is_current_word_bad = True
|
317 |
+
self._foreign_long_count += 1
|
318 |
+
if buffer_length >= 24 and self._foreign_long_watch:
|
319 |
+
camel_case_dst = [
|
320 |
+
i
|
321 |
+
for c, i in zip(self._buffer, range(0, buffer_length))
|
322 |
+
if c.isupper()
|
323 |
+
]
|
324 |
+
probable_camel_cased: bool = False
|
325 |
+
|
326 |
+
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
|
327 |
+
probable_camel_cased = True
|
328 |
+
|
329 |
+
if not probable_camel_cased:
|
330 |
+
self._foreign_long_count += 1
|
331 |
+
self._is_current_word_bad = True
|
332 |
+
|
333 |
+
if self._is_current_word_bad:
|
334 |
+
self._bad_word_count += 1
|
335 |
+
self._bad_character_count += len(self._buffer)
|
336 |
+
self._is_current_word_bad = False
|
337 |
+
|
338 |
+
self._foreign_long_watch = False
|
339 |
+
self._buffer = ""
|
340 |
+
self._buffer_accent_count = 0
|
341 |
+
self._buffer_glyph_count = 0
|
342 |
+
elif (
|
343 |
+
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
344 |
+
and character.isdigit() is False
|
345 |
+
and is_symbol(character)
|
346 |
+
):
|
347 |
+
self._is_current_word_bad = True
|
348 |
+
self._buffer += character
|
349 |
+
|
350 |
+
def reset(self) -> None: # Abstract
|
351 |
+
self._buffer = ""
|
352 |
+
self._is_current_word_bad = False
|
353 |
+
self._foreign_long_watch = False
|
354 |
+
self._bad_word_count = 0
|
355 |
+
self._word_count = 0
|
356 |
+
self._character_count = 0
|
357 |
+
self._bad_character_count = 0
|
358 |
+
self._foreign_long_count = 0
|
359 |
+
|
360 |
+
@property
|
361 |
+
def ratio(self) -> float:
|
362 |
+
if self._word_count <= 10 and self._foreign_long_count == 0:
|
363 |
+
return 0.0
|
364 |
+
|
365 |
+
return self._bad_character_count / self._character_count
|
366 |
+
|
367 |
+
|
368 |
+
class CjkInvalidStopPlugin(MessDetectorPlugin):
|
369 |
+
"""
|
370 |
+
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
|
371 |
+
can be easily detected. Searching for the overuse of '丅' and '丄'.
|
372 |
+
"""
|
373 |
+
|
374 |
+
def __init__(self) -> None:
|
375 |
+
self._wrong_stop_count: int = 0
|
376 |
+
self._cjk_character_count: int = 0
|
377 |
+
|
378 |
+
def eligible(self, character: str) -> bool:
|
379 |
+
return True
|
380 |
+
|
381 |
+
def feed(self, character: str) -> None:
|
382 |
+
if character in {"丅", "丄"}:
|
383 |
+
self._wrong_stop_count += 1
|
384 |
+
return
|
385 |
+
if is_cjk(character):
|
386 |
+
self._cjk_character_count += 1
|
387 |
+
|
388 |
+
def reset(self) -> None: # Abstract
|
389 |
+
self._wrong_stop_count = 0
|
390 |
+
self._cjk_character_count = 0
|
391 |
+
|
392 |
+
@property
|
393 |
+
def ratio(self) -> float:
|
394 |
+
if self._cjk_character_count < 16:
|
395 |
+
return 0.0
|
396 |
+
return self._wrong_stop_count / self._cjk_character_count
|
397 |
+
|
398 |
+
|
399 |
+
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
400 |
+
def __init__(self) -> None:
|
401 |
+
self._buf: bool = False
|
402 |
+
|
403 |
+
self._character_count_since_last_sep: int = 0
|
404 |
+
|
405 |
+
self._successive_upper_lower_count: int = 0
|
406 |
+
self._successive_upper_lower_count_final: int = 0
|
407 |
+
|
408 |
+
self._character_count: int = 0
|
409 |
+
|
410 |
+
self._last_alpha_seen: str | None = None
|
411 |
+
self._current_ascii_only: bool = True
|
412 |
+
|
413 |
+
def eligible(self, character: str) -> bool:
|
414 |
+
return True
|
415 |
+
|
416 |
+
def feed(self, character: str) -> None:
|
417 |
+
is_concerned = character.isalpha() and is_case_variable(character)
|
418 |
+
chunk_sep = is_concerned is False
|
419 |
+
|
420 |
+
if chunk_sep and self._character_count_since_last_sep > 0:
|
421 |
+
if (
|
422 |
+
self._character_count_since_last_sep <= 64
|
423 |
+
and character.isdigit() is False
|
424 |
+
and self._current_ascii_only is False
|
425 |
+
):
|
426 |
+
self._successive_upper_lower_count_final += (
|
427 |
+
self._successive_upper_lower_count
|
428 |
+
)
|
429 |
+
|
430 |
+
self._successive_upper_lower_count = 0
|
431 |
+
self._character_count_since_last_sep = 0
|
432 |
+
self._last_alpha_seen = None
|
433 |
+
self._buf = False
|
434 |
+
self._character_count += 1
|
435 |
+
self._current_ascii_only = True
|
436 |
+
|
437 |
+
return
|
438 |
+
|
439 |
+
if self._current_ascii_only is True and character.isascii() is False:
|
440 |
+
self._current_ascii_only = False
|
441 |
+
|
442 |
+
if self._last_alpha_seen is not None:
|
443 |
+
if (character.isupper() and self._last_alpha_seen.islower()) or (
|
444 |
+
character.islower() and self._last_alpha_seen.isupper()
|
445 |
+
):
|
446 |
+
if self._buf is True:
|
447 |
+
self._successive_upper_lower_count += 2
|
448 |
+
self._buf = False
|
449 |
+
else:
|
450 |
+
self._buf = True
|
451 |
+
else:
|
452 |
+
self._buf = False
|
453 |
+
|
454 |
+
self._character_count += 1
|
455 |
+
self._character_count_since_last_sep += 1
|
456 |
+
self._last_alpha_seen = character
|
457 |
+
|
458 |
+
def reset(self) -> None: # Abstract
|
459 |
+
self._character_count = 0
|
460 |
+
self._character_count_since_last_sep = 0
|
461 |
+
self._successive_upper_lower_count = 0
|
462 |
+
self._successive_upper_lower_count_final = 0
|
463 |
+
self._last_alpha_seen = None
|
464 |
+
self._buf = False
|
465 |
+
self._current_ascii_only = True
|
466 |
+
|
467 |
+
@property
|
468 |
+
def ratio(self) -> float:
|
469 |
+
if self._character_count == 0:
|
470 |
+
return 0.0
|
471 |
+
|
472 |
+
return self._successive_upper_lower_count_final / self._character_count
|
473 |
+
|
474 |
+
|
475 |
+
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
|
476 |
+
def __init__(self) -> None:
|
477 |
+
self._character_count: int = 0
|
478 |
+
self._isolated_form_count: int = 0
|
479 |
+
|
480 |
+
def reset(self) -> None: # Abstract
|
481 |
+
self._character_count = 0
|
482 |
+
self._isolated_form_count = 0
|
483 |
+
|
484 |
+
def eligible(self, character: str) -> bool:
|
485 |
+
return is_arabic(character)
|
486 |
+
|
487 |
+
def feed(self, character: str) -> None:
|
488 |
+
self._character_count += 1
|
489 |
+
|
490 |
+
if is_arabic_isolated_form(character):
|
491 |
+
self._isolated_form_count += 1
|
492 |
+
|
493 |
+
@property
|
494 |
+
def ratio(self) -> float:
|
495 |
+
if self._character_count < 8:
|
496 |
+
return 0.0
|
497 |
+
|
498 |
+
isolated_form_usage: float = self._isolated_form_count / self._character_count
|
499 |
+
|
500 |
+
return isolated_form_usage
|
501 |
+
|
502 |
+
|
503 |
+
@lru_cache(maxsize=1024)
|
504 |
+
def is_suspiciously_successive_range(
|
505 |
+
unicode_range_a: str | None, unicode_range_b: str | None
|
506 |
+
) -> bool:
|
507 |
+
"""
|
508 |
+
Determine if two Unicode range seen next to each other can be considered as suspicious.
|
509 |
+
"""
|
510 |
+
if unicode_range_a is None or unicode_range_b is None:
|
511 |
+
return True
|
512 |
+
|
513 |
+
if unicode_range_a == unicode_range_b:
|
514 |
+
return False
|
515 |
+
|
516 |
+
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
517 |
+
return False
|
518 |
+
|
519 |
+
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
520 |
+
return False
|
521 |
+
|
522 |
+
# Latin characters can be accompanied with a combining diacritical mark
|
523 |
+
# eg. Vietnamese.
|
524 |
+
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
525 |
+
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
526 |
+
):
|
527 |
+
return False
|
528 |
+
|
529 |
+
keywords_range_a, keywords_range_b = (
|
530 |
+
unicode_range_a.split(" "),
|
531 |
+
unicode_range_b.split(" "),
|
532 |
+
)
|
533 |
+
|
534 |
+
for el in keywords_range_a:
|
535 |
+
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
536 |
+
continue
|
537 |
+
if el in keywords_range_b:
|
538 |
+
return False
|
539 |
+
|
540 |
+
# Japanese Exception
|
541 |
+
range_a_jp_chars, range_b_jp_chars = (
|
542 |
+
unicode_range_a
|
543 |
+
in (
|
544 |
+
"Hiragana",
|
545 |
+
"Katakana",
|
546 |
+
),
|
547 |
+
unicode_range_b in ("Hiragana", "Katakana"),
|
548 |
+
)
|
549 |
+
if (range_a_jp_chars or range_b_jp_chars) and (
|
550 |
+
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
551 |
+
):
|
552 |
+
return False
|
553 |
+
if range_a_jp_chars and range_b_jp_chars:
|
554 |
+
return False
|
555 |
+
|
556 |
+
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
557 |
+
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
558 |
+
return False
|
559 |
+
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
560 |
+
return False
|
561 |
+
|
562 |
+
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
563 |
+
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
564 |
+
unicode_range_a in ["Katakana", "Hiragana"]
|
565 |
+
and unicode_range_b in ["Katakana", "Hiragana"]
|
566 |
+
):
|
567 |
+
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
568 |
+
return False
|
569 |
+
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
570 |
+
return False
|
571 |
+
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
572 |
+
return False
|
573 |
+
|
574 |
+
return True
|
575 |
+
|
576 |
+
|
577 |
+
@lru_cache(maxsize=2048)
|
578 |
+
def mess_ratio(
|
579 |
+
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
580 |
+
) -> float:
|
581 |
+
"""
|
582 |
+
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
583 |
+
"""
|
584 |
+
|
585 |
+
detectors: list[MessDetectorPlugin] = [
|
586 |
+
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
587 |
+
]
|
588 |
+
|
589 |
+
length: int = len(decoded_sequence) + 1
|
590 |
+
|
591 |
+
mean_mess_ratio: float = 0.0
|
592 |
+
|
593 |
+
if length < 512:
|
594 |
+
intermediary_mean_mess_ratio_calc: int = 32
|
595 |
+
elif length <= 1024:
|
596 |
+
intermediary_mean_mess_ratio_calc = 64
|
597 |
+
else:
|
598 |
+
intermediary_mean_mess_ratio_calc = 128
|
599 |
+
|
600 |
+
for character, index in zip(decoded_sequence + "\n", range(length)):
|
601 |
+
for detector in detectors:
|
602 |
+
if detector.eligible(character):
|
603 |
+
detector.feed(character)
|
604 |
+
|
605 |
+
if (
|
606 |
+
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
|
607 |
+
) or index == length - 1:
|
608 |
+
mean_mess_ratio = sum(dt.ratio for dt in detectors)
|
609 |
+
|
610 |
+
if mean_mess_ratio >= maximum_threshold:
|
611 |
+
break
|
612 |
+
|
613 |
+
if debug:
|
614 |
+
logger = getLogger("charset_normalizer")
|
615 |
+
|
616 |
+
logger.log(
|
617 |
+
TRACE,
|
618 |
+
"Mess-detector extended-analysis start. "
|
619 |
+
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
|
620 |
+
f"maximum_threshold={maximum_threshold}",
|
621 |
+
)
|
622 |
+
|
623 |
+
if len(decoded_sequence) > 16:
|
624 |
+
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
|
625 |
+
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
|
626 |
+
|
627 |
+
for dt in detectors:
|
628 |
+
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
|
629 |
+
|
630 |
+
return round(mean_mess_ratio, 3)
|
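A short usage sketch for the mess detector defined above; mess_ratio() is the module's entry point, and the call below assumes the vendored package is importable:

# Usage sketch only; the sample strings are illustrative, not from the diff.
from charset_normalizer.md import mess_ratio

clean = "This is a perfectly ordinary English sentence."
noisy = "ÃÂ©ÃÂ¨ÃÂ§ â€™ â€œ ÃÂ±ÃÂ²"  # mojibake-looking input

print(mess_ratio(clean))                          # typically near 0.0 for clean text
print(mess_ratio(noisy, maximum_threshold=0.5))   # grows as plugins flag noise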
env/Lib/site-packages/charset_normalizer/models.py
ADDED
@@ -0,0 +1,360 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from encodings.aliases import aliases
|
4 |
+
from hashlib import sha256
|
5 |
+
from json import dumps
|
6 |
+
from re import sub
|
7 |
+
from typing import Any, Iterator, List, Tuple
|
8 |
+
|
9 |
+
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
10 |
+
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
11 |
+
|
12 |
+
|
13 |
+
class CharsetMatch:
|
14 |
+
def __init__(
|
15 |
+
self,
|
16 |
+
payload: bytes,
|
17 |
+
guessed_encoding: str,
|
18 |
+
mean_mess_ratio: float,
|
19 |
+
has_sig_or_bom: bool,
|
20 |
+
languages: CoherenceMatches,
|
21 |
+
decoded_payload: str | None = None,
|
22 |
+
preemptive_declaration: str | None = None,
|
23 |
+
):
|
24 |
+
self._payload: bytes = payload
|
25 |
+
|
26 |
+
self._encoding: str = guessed_encoding
|
27 |
+
self._mean_mess_ratio: float = mean_mess_ratio
|
28 |
+
self._languages: CoherenceMatches = languages
|
29 |
+
self._has_sig_or_bom: bool = has_sig_or_bom
|
30 |
+
self._unicode_ranges: list[str] | None = None
|
31 |
+
|
32 |
+
self._leaves: list[CharsetMatch] = []
|
33 |
+
self._mean_coherence_ratio: float = 0.0
|
34 |
+
|
35 |
+
self._output_payload: bytes | None = None
|
36 |
+
self._output_encoding: str | None = None
|
37 |
+
|
38 |
+
self._string: str | None = decoded_payload
|
39 |
+
|
40 |
+
self._preemptive_declaration: str | None = preemptive_declaration
|
41 |
+
|
42 |
+
def __eq__(self, other: object) -> bool:
|
43 |
+
if not isinstance(other, CharsetMatch):
|
44 |
+
if isinstance(other, str):
|
45 |
+
return iana_name(other) == self.encoding
|
46 |
+
return False
|
47 |
+
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
48 |
+
|
49 |
+
def __lt__(self, other: object) -> bool:
|
50 |
+
"""
|
51 |
+
Implemented to make sorted available upon CharsetMatches items.
|
52 |
+
"""
|
53 |
+
if not isinstance(other, CharsetMatch):
|
54 |
+
raise ValueError
|
55 |
+
|
56 |
+
chaos_difference: float = abs(self.chaos - other.chaos)
|
57 |
+
coherence_difference: float = abs(self.coherence - other.coherence)
|
58 |
+
|
59 |
+
# Below 1% difference --> Use Coherence
|
60 |
+
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
61 |
+
return self.coherence > other.coherence
|
62 |
+
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
|
63 |
+
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
|
64 |
+
# preserve RAM usage!
|
65 |
+
if len(self._payload) >= TOO_BIG_SEQUENCE:
|
66 |
+
return self.chaos < other.chaos
|
67 |
+
return self.multi_byte_usage > other.multi_byte_usage
|
68 |
+
|
69 |
+
return self.chaos < other.chaos
|
70 |
+
|
71 |
+
@property
|
72 |
+
def multi_byte_usage(self) -> float:
|
73 |
+
return 1.0 - (len(str(self)) / len(self.raw))
|
74 |
+
|
75 |
+
def __str__(self) -> str:
|
76 |
+
# Lazy Str Loading
|
77 |
+
if self._string is None:
|
78 |
+
self._string = str(self._payload, self._encoding, "strict")
|
79 |
+
return self._string
|
80 |
+
|
81 |
+
def __repr__(self) -> str:
|
82 |
+
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
|
83 |
+
|
84 |
+
def add_submatch(self, other: CharsetMatch) -> None:
|
85 |
+
if not isinstance(other, CharsetMatch) or other == self:
|
86 |
+
raise ValueError(
|
87 |
+
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
88 |
+
other.__class__
|
89 |
+
)
|
90 |
+
)
|
91 |
+
|
92 |
+
other._string = None # Unload RAM usage; dirty trick.
|
93 |
+
self._leaves.append(other)
|
94 |
+
|
95 |
+
@property
|
96 |
+
def encoding(self) -> str:
|
97 |
+
return self._encoding
|
98 |
+
|
99 |
+
@property
|
100 |
+
def encoding_aliases(self) -> list[str]:
|
101 |
+
"""
|
102 |
+
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
103 |
+
"""
|
104 |
+
also_known_as: list[str] = []
|
105 |
+
for u, p in aliases.items():
|
106 |
+
if self.encoding == u:
|
107 |
+
also_known_as.append(p)
|
108 |
+
elif self.encoding == p:
|
109 |
+
also_known_as.append(u)
|
110 |
+
return also_known_as
|
111 |
+
|
112 |
+
@property
|
113 |
+
def bom(self) -> bool:
|
114 |
+
return self._has_sig_or_bom
|
115 |
+
|
116 |
+
@property
|
117 |
+
def byte_order_mark(self) -> bool:
|
118 |
+
return self._has_sig_or_bom
|
119 |
+
|
120 |
+
@property
|
121 |
+
def languages(self) -> list[str]:
|
122 |
+
"""
|
123 |
+
Return the complete list of possible languages found in decoded sequence.
|
124 |
+
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
|
125 |
+
"""
|
126 |
+
return [e[0] for e in self._languages]
|
127 |
+
|
128 |
+
@property
|
129 |
+
def language(self) -> str:
|
130 |
+
"""
|
131 |
+
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
132 |
+
"Unknown".
|
133 |
+
"""
|
134 |
+
if not self._languages:
|
135 |
+
# Trying to infer the language based on the given encoding
|
136 |
+
# Its either English or we should not pronounce ourselves in certain cases.
|
137 |
+
if "ascii" in self.could_be_from_charset:
|
138 |
+
return "English"
|
139 |
+
|
140 |
+
# doing it there to avoid circular import
|
141 |
+
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
142 |
+
|
143 |
+
languages = (
|
144 |
+
mb_encoding_languages(self.encoding)
|
145 |
+
if is_multi_byte_encoding(self.encoding)
|
146 |
+
else encoding_languages(self.encoding)
|
147 |
+
)
|
148 |
+
|
149 |
+
if len(languages) == 0 or "Latin Based" in languages:
|
150 |
+
return "Unknown"
|
151 |
+
|
152 |
+
return languages[0]
|
153 |
+
|
154 |
+
return self._languages[0][0]
|
155 |
+
|
156 |
+
@property
|
157 |
+
def chaos(self) -> float:
|
158 |
+
return self._mean_mess_ratio
|
159 |
+
|
160 |
+
@property
|
161 |
+
def coherence(self) -> float:
|
162 |
+
if not self._languages:
|
163 |
+
return 0.0
|
164 |
+
return self._languages[0][1]
|
165 |
+
|
166 |
+
@property
|
167 |
+
def percent_chaos(self) -> float:
|
168 |
+
return round(self.chaos * 100, ndigits=3)
|
169 |
+
|
170 |
+
@property
|
171 |
+
def percent_coherence(self) -> float:
|
172 |
+
return round(self.coherence * 100, ndigits=3)
|
173 |
+
|
174 |
+
@property
|
175 |
+
def raw(self) -> bytes:
|
176 |
+
"""
|
177 |
+
Original untouched bytes.
|
178 |
+
"""
|
179 |
+
return self._payload
|
180 |
+
|
181 |
+
@property
|
182 |
+
def submatch(self) -> list[CharsetMatch]:
|
183 |
+
return self._leaves
|
184 |
+
|
185 |
+
@property
|
186 |
+
def has_submatch(self) -> bool:
|
187 |
+
return len(self._leaves) > 0
|
188 |
+
|
189 |
+
@property
|
190 |
+
def alphabets(self) -> list[str]:
|
191 |
+
if self._unicode_ranges is not None:
|
192 |
+
return self._unicode_ranges
|
193 |
+
# list detected ranges
|
194 |
+
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
|
195 |
+
# filter and sort
|
196 |
+
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
197 |
+
return self._unicode_ranges
|
198 |
+
|
199 |
+
@property
|
200 |
+
def could_be_from_charset(self) -> list[str]:
|
201 |
+
"""
|
202 |
+
The complete list of encoding that output the exact SAME str result and therefore could be the originating
|
203 |
+
encoding.
|
204 |
+
This list does include the encoding available in property 'encoding'.
|
205 |
+
"""
|
206 |
+
return [self._encoding] + [m.encoding for m in self._leaves]
|
207 |
+
|
208 |
+
def output(self, encoding: str = "utf_8") -> bytes:
|
209 |
+
"""
|
210 |
+
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
211 |
+
Any errors will be simply ignored by the encoder NOT replaced.
|
212 |
+
"""
|
213 |
+
if self._output_encoding is None or self._output_encoding != encoding:
|
214 |
+
self._output_encoding = encoding
|
215 |
+
decoded_string = str(self)
|
216 |
+
if (
|
217 |
+
self._preemptive_declaration is not None
|
218 |
+
and self._preemptive_declaration.lower()
|
219 |
+
not in ["utf-8", "utf8", "utf_8"]
|
220 |
+
):
|
221 |
+
patched_header = sub(
|
222 |
+
RE_POSSIBLE_ENCODING_INDICATION,
|
223 |
+
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
|
224 |
+
m.groups()[0],
|
225 |
+
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
|
226 |
+
),
|
227 |
+
decoded_string[:8192],
|
228 |
+
count=1,
|
229 |
+
)
|
230 |
+
|
231 |
+
decoded_string = patched_header + decoded_string[8192:]
|
232 |
+
|
233 |
+
self._output_payload = decoded_string.encode(encoding, "replace")
|
234 |
+
|
235 |
+
return self._output_payload # type: ignore
|
236 |
+
|
237 |
+
@property
|
238 |
+
def fingerprint(self) -> str:
|
239 |
+
"""
|
240 |
+
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
|
241 |
+
"""
|
242 |
+
return sha256(self.output()).hexdigest()
|
243 |
+
|
244 |
+
|
245 |
+
class CharsetMatches:
|
246 |
+
"""
|
247 |
+
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
248 |
+
Act like a list(iterable) but does not implements all related methods.
|
249 |
+
"""
|
250 |
+
|
251 |
+
def __init__(self, results: list[CharsetMatch] | None = None):
|
252 |
+
self._results: list[CharsetMatch] = sorted(results) if results else []
|
253 |
+
|
254 |
+
def __iter__(self) -> Iterator[CharsetMatch]:
|
255 |
+
yield from self._results
|
256 |
+
|
257 |
+
def __getitem__(self, item: int | str) -> CharsetMatch:
|
258 |
+
"""
|
259 |
+
Retrieve a single item either by its position or encoding name (alias may be used here).
|
260 |
+
Raise KeyError upon invalid index or encoding not present in results.
|
261 |
+
"""
|
262 |
+
if isinstance(item, int):
|
263 |
+
return self._results[item]
|
264 |
+
if isinstance(item, str):
|
265 |
+
item = iana_name(item, False)
|
266 |
+
for result in self._results:
|
267 |
+
if item in result.could_be_from_charset:
|
268 |
+
return result
|
269 |
+
raise KeyError
|
270 |
+
|
271 |
+
def __len__(self) -> int:
|
272 |
+
return len(self._results)
|
273 |
+
|
274 |
+
def __bool__(self) -> bool:
|
275 |
+
return len(self._results) > 0
|
276 |
+
|
277 |
+
def append(self, item: CharsetMatch) -> None:
|
278 |
+
"""
|
279 |
+
Insert a single match. Will be inserted accordingly to preserve sort.
|
280 |
+
Can be inserted as a submatch.
|
281 |
+
"""
|
282 |
+
if not isinstance(item, CharsetMatch):
|
283 |
+
raise ValueError(
|
284 |
+
"Cannot append instance '{}' to CharsetMatches".format(
|
285 |
+
str(item.__class__)
|
286 |
+
)
|
287 |
+
)
|
288 |
+
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
289 |
+
if len(item.raw) < TOO_BIG_SEQUENCE:
|
290 |
+
for match in self._results:
|
291 |
+
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
292 |
+
match.add_submatch(item)
|
293 |
+
return
|
294 |
+
self._results.append(item)
|
295 |
+
self._results = sorted(self._results)
|
296 |
+
|
297 |
+
def best(self) -> CharsetMatch | None:
|
298 |
+
"""
|
299 |
+
Simply return the first match. Strict equivalent to matches[0].
|
300 |
+
"""
|
301 |
+
if not self._results:
|
302 |
+
return None
|
303 |
+
return self._results[0]
|
304 |
+
|
305 |
+
def first(self) -> CharsetMatch | None:
|
306 |
+
"""
|
307 |
+
Redundant method, call the method best(). Kept for BC reasons.
|
308 |
+
"""
|
309 |
+
return self.best()
|
310 |
+
|
311 |
+
|
312 |
+
CoherenceMatch = Tuple[str, float]
|
313 |
+
CoherenceMatches = List[CoherenceMatch]
|
314 |
+
|
315 |
+
|
316 |
+
class CliDetectionResult:
|
317 |
+
def __init__(
|
318 |
+
self,
|
319 |
+
path: str,
|
320 |
+
encoding: str | None,
|
321 |
+
encoding_aliases: list[str],
|
322 |
+
alternative_encodings: list[str],
|
323 |
+
language: str,
|
324 |
+
alphabets: list[str],
|
325 |
+
has_sig_or_bom: bool,
|
326 |
+
chaos: float,
|
327 |
+
coherence: float,
|
328 |
+
unicode_path: str | None,
|
329 |
+
is_preferred: bool,
|
330 |
+
):
|
331 |
+
self.path: str = path
|
332 |
+
self.unicode_path: str | None = unicode_path
|
333 |
+
self.encoding: str | None = encoding
|
334 |
+
self.encoding_aliases: list[str] = encoding_aliases
|
335 |
+
self.alternative_encodings: list[str] = alternative_encodings
|
336 |
+
self.language: str = language
|
337 |
+
self.alphabets: list[str] = alphabets
|
338 |
+
self.has_sig_or_bom: bool = has_sig_or_bom
|
339 |
+
self.chaos: float = chaos
|
340 |
+
self.coherence: float = coherence
|
341 |
+
self.is_preferred: bool = is_preferred
|
342 |
+
|
343 |
+
@property
|
344 |
+
def __dict__(self) -> dict[str, Any]: # type: ignore
|
345 |
+
return {
|
346 |
+
"path": self.path,
|
347 |
+
"encoding": self.encoding,
|
348 |
+
"encoding_aliases": self.encoding_aliases,
|
349 |
+
"alternative_encodings": self.alternative_encodings,
|
350 |
+
"language": self.language,
|
351 |
+
"alphabets": self.alphabets,
|
352 |
+
"has_sig_or_bom": self.has_sig_or_bom,
|
353 |
+
"chaos": self.chaos,
|
354 |
+
"coherence": self.coherence,
|
355 |
+
"unicode_path": self.unicode_path,
|
356 |
+
"is_preferred": self.is_preferred,
|
357 |
+
}
|
358 |
+
|
359 |
+
def to_json(self) -> str:
|
360 |
+
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
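A brief sketch of how the CharsetMatch / CharsetMatches containers above are typically consumed through from_bytes(), the API imported earlier in this diff (again assuming the vendored package is importable):

# Usage sketch only; the sample text is illustrative.
from charset_normalizer import from_bytes

matches = from_bytes("Bonjour, ceci est un petit test de détection.".encode("utf_8"))

best = matches.best()          # CharsetMatches.best() -> CharsetMatch | None
if best is not None:
    print(best.encoding)       # e.g. a utf_8 guess
    print(best.language)       # most probable language, or "Unknown"
    print(best.percent_chaos)  # mess ratio as a percentage
    print(str(best)[:40])      # decoded payload via CharsetMatch.__str__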
env/Lib/site-packages/charset_normalizer/py.typed
ADDED
File without changes
|
env/Lib/site-packages/charset_normalizer/utils.py
ADDED
@@ -0,0 +1,408 @@
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import importlib
|
4 |
+
import logging
|
5 |
+
import unicodedata
|
6 |
+
from codecs import IncrementalDecoder
|
7 |
+
from encodings.aliases import aliases
|
8 |
+
from functools import lru_cache
|
9 |
+
from re import findall
|
10 |
+
from typing import Generator
|
11 |
+
|
12 |
+
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
13 |
+
MultibyteIncrementalDecoder,
|
14 |
+
)
|
15 |
+
|
16 |
+
from .constant import (
|
17 |
+
ENCODING_MARKS,
|
18 |
+
IANA_SUPPORTED_SIMILAR,
|
19 |
+
RE_POSSIBLE_ENCODING_INDICATION,
|
20 |
+
UNICODE_RANGES_COMBINED,
|
21 |
+
UNICODE_SECONDARY_RANGE_KEYWORD,
|
22 |
+
UTF8_MAXIMAL_ALLOCATION,
|
23 |
+
)
|
24 |
+
|
25 |
+
|
26 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
27 |
+
def is_accentuated(character: str) -> bool:
|
28 |
+
try:
|
29 |
+
description: str = unicodedata.name(character)
|
30 |
+
except ValueError: # Defensive: unicode database outdated?
|
31 |
+
return False
|
32 |
+
return (
|
33 |
+
"WITH GRAVE" in description
|
34 |
+
or "WITH ACUTE" in description
|
35 |
+
or "WITH CEDILLA" in description
|
36 |
+
or "WITH DIAERESIS" in description
|
37 |
+
or "WITH CIRCUMFLEX" in description
|
38 |
+
or "WITH TILDE" in description
|
39 |
+
or "WITH MACRON" in description
|
40 |
+
or "WITH RING ABOVE" in description
|
41 |
+
)
|
42 |
+
|
43 |
+
|
44 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
45 |
+
def remove_accent(character: str) -> str:
|
46 |
+
decomposed: str = unicodedata.decomposition(character)
|
47 |
+
if not decomposed:
|
48 |
+
return character
|
49 |
+
|
50 |
+
codes: list[str] = decomposed.split(" ")
|
51 |
+
|
52 |
+
return chr(int(codes[0], 16))
|
53 |
+
|
54 |
+
|
55 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
56 |
+
def unicode_range(character: str) -> str | None:
|
57 |
+
"""
|
58 |
+
Retrieve the Unicode range official name from a single character.
|
59 |
+
"""
|
60 |
+
character_ord: int = ord(character)
|
61 |
+
|
62 |
+
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
63 |
+
if character_ord in ord_range:
|
64 |
+
return range_name
|
65 |
+
|
66 |
+
return None
|
67 |
+
|
68 |
+
|
69 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
70 |
+
def is_latin(character: str) -> bool:
|
71 |
+
try:
|
72 |
+
description: str = unicodedata.name(character)
|
73 |
+
except ValueError: # Defensive: unicode database outdated?
|
74 |
+
return False
|
75 |
+
return "LATIN" in description
|
76 |
+
|
77 |
+
|
78 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
79 |
+
def is_punctuation(character: str) -> bool:
|
80 |
+
character_category: str = unicodedata.category(character)
|
81 |
+
|
82 |
+
if "P" in character_category:
|
83 |
+
return True
|
84 |
+
|
85 |
+
character_range: str | None = unicode_range(character)
|
86 |
+
|
87 |
+
if character_range is None:
|
88 |
+
return False
|
89 |
+
|
90 |
+
return "Punctuation" in character_range
|
91 |
+
|
92 |
+
|
93 |
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
94 |
+
def is_symbol(character: str) -> bool:
|
95 |
+
character_category: str = unicodedata.category(character)
|
96 |
+
|
97 |
+
if "S" in character_category or "N" in character_category:
|
98 |
+
return True
|
99 |
+
|
100 |
+
character_range: str | None = unicode_range(character)
|
101 |
+
|
102 |
+
    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: str | None = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:  # Defensive: unicode database outdated?
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1a"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python,
        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
    """
    Extract, using an ASCII-only decoder, any specified encoding in the first n bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: list[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    """Returns the Python normalized encoding name (not the IANA official name)."""
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")

    return cp_name


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: str | None = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
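
A minimal usage sketch of the helpers defined above (not part of the uploaded file; it assumes the bundled charset_normalizer 3.4.1 in this env is importable):

# Quick sanity checks against charset_normalizer's utility helpers
# (assumption: the vendored package above is on sys.path as `charset_normalizer`).
from charset_normalizer.utils import (
    any_specified_encoding,
    iana_name,
    identify_sig_or_bom,
    is_cjk,
)

# BOM/signature detection: returns the codec name and the raw mark.
print(identify_sig_or_bom(b"\xef\xbb\xbfhello"))  # ('utf_8', b'\xef\xbb\xbf')

# Codec label normalization to Python's canonical (non-IANA) name.
print(iana_name("UTF-8"))  # 'utf_8'

# Declared-charset extraction from an ASCII-decodable prologue.
print(any_specified_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>'))  # 'latin_1', via Python's codec aliases

# Character-class helper used by the detection heuristics.
print(is_cjk("\u4e2d"))  # True
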
env/Lib/site-packages/charset_normalizer/version.py
ADDED
@@ -0,0 +1,8 @@
"""
Expose version
"""

from __future__ import annotations

__version__ = "3.4.1"
VERSION = __version__.split(".")
env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
pip
env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA
ADDED
@@ -0,0 +1,441 @@
Metadata-Version: 2.1
Name: colorama
Version: 0.4.6
Summary: Cross-platform colored terminal text.
Project-URL: Homepage, https://github.com/tartley/colorama
Author-email: Jonathan Hartley <[email protected]>
License-File: LICENSE.txt
Keywords: ansi,color,colour,crossplatform,terminal,text,windows,xplatform
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Terminals
Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7
Description-Content-Type: text/x-rst

.. image:: https://img.shields.io/pypi/v/colorama.svg
   :target: https://pypi.org/project/colorama/
   :alt: Latest Version

.. image:: https://img.shields.io/pypi/pyversions/colorama.svg
   :target: https://pypi.org/project/colorama/
   :alt: Supported Python versions

.. image:: https://github.com/tartley/colorama/actions/workflows/test.yml/badge.svg
   :target: https://github.com/tartley/colorama/actions/workflows/test.yml
   :alt: Build Status

Colorama
========

Makes ANSI escape character sequences (for producing colored terminal text and
cursor positioning) work under MS Windows.

.. |donate| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif
   :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=2MZ9D2GMLYCUJ&item_name=Colorama&currency_code=USD
   :alt: Donate with Paypal

`PyPI for releases <https://pypi.org/project/colorama/>`_ |
`Github for source <https://github.com/tartley/colorama>`_ |
`Colorama for enterprise on Tidelift <https://github.com/tartley/colorama/blob/master/ENTERPRISE.md>`_

If you find Colorama useful, please |donate| to the authors. Thank you!

Installation
------------

Tested on CPython 2.7, 3.7, 3.8, 3.9 and 3.10 and Pypy 2.7 and 3.8.

No requirements other than the standard library.

.. code-block:: bash

    pip install colorama
    # or
    conda install -c anaconda colorama

Description
-----------

ANSI escape character sequences have long been used to produce colored terminal
text and cursor positioning on Unix and Macs. Colorama makes this work on
Windows, too, by wrapping ``stdout``, stripping ANSI sequences it finds (which
would appear as gobbledygook in the output), and converting them into the
appropriate win32 calls to modify the state of the terminal. On other platforms,
Colorama does nothing.

This has the upshot of providing a simple cross-platform API for printing
colored terminal text from Python, and has the happy side-effect that existing
applications or libraries which use ANSI sequences to produce colored output on
Linux or Macs can now also work on Windows, simply by calling
``colorama.just_fix_windows_console()`` (since v0.4.6) or ``colorama.init()``
(all versions, but may have other side-effects – see below).

An alternative approach is to install ``ansi.sys`` on Windows machines, which
provides the same behaviour for all applications running in terminals. Colorama
is intended for situations where that isn't easy (e.g., maybe your app doesn't
have an installer.)

Demo scripts in the source code repository print some colored text using
ANSI sequences. Compare their output under Gnome-terminal's built in ANSI
handling, versus on Windows Command-Prompt using Colorama:

.. image:: https://github.com/tartley/colorama/raw/master/screenshots/ubuntu-demo.png
   :width: 661
   :height: 357
   :alt: ANSI sequences on Ubuntu under gnome-terminal.

.. image:: https://github.com/tartley/colorama/raw/master/screenshots/windows-demo.png
   :width: 668
   :height: 325
   :alt: Same ANSI sequences on Windows, using Colorama.

These screenshots show that, on Windows, Colorama does not support ANSI 'dim
text'; it looks the same as 'normal text'.

Usage
-----

Initialisation
..............

If the only thing you want from Colorama is to get ANSI escapes to work on
Windows, then run:

.. code-block:: python

    from colorama import just_fix_windows_console
    just_fix_windows_console()

If you're on a recent version of Windows 10 or better, and your stdout/stderr
are pointing to a Windows console, then this will flip the magic configuration
switch to enable Windows' built-in ANSI support.

If you're on an older version of Windows, and your stdout/stderr are pointing to
a Windows console, then this will wrap ``sys.stdout`` and/or ``sys.stderr`` in a
magic file object that intercepts ANSI escape sequences and issues the
appropriate Win32 calls to emulate them.

In all other circumstances, it does nothing whatsoever. Basically the idea is
that this makes Windows act like Unix with respect to ANSI escape handling.

It's safe to call this function multiple times. It's safe to call this function
on non-Windows platforms, but it won't do anything. It's safe to call this
function when one or both of your stdout/stderr are redirected to a file – it
won't do anything to those streams.

Alternatively, you can use the older interface with more features (but also more
potential footguns):

.. code-block:: python

    from colorama import init
    init()

This does the same thing as ``just_fix_windows_console``, except for the
following differences:

- It's not safe to call ``init`` multiple times; you can end up with multiple
  layers of wrapping and broken ANSI support.

- Colorama will apply a heuristic to guess whether stdout/stderr support ANSI,
  and if it thinks they don't, then it will wrap ``sys.stdout`` and
  ``sys.stderr`` in a magic file object that strips out ANSI escape sequences
  before printing them. This happens on all platforms, and can be convenient if
  you want to write your code to emit ANSI escape sequences unconditionally, and
  let Colorama decide whether they should actually be output. But note that
  Colorama's heuristic is not particularly clever.

- ``init`` also accepts explicit keyword args to enable/disable various
  functionality – see below.

To stop using Colorama before your program exits, simply call ``deinit()``.
This will restore ``stdout`` and ``stderr`` to their original values, so that
Colorama is disabled. To resume using Colorama again, call ``reinit()``; it is
cheaper than calling ``init()`` again (but does the same thing).

Most users should depend on ``colorama >= 0.4.6``, and use
``just_fix_windows_console``. The old ``init`` interface will be supported
indefinitely for backwards compatibility, but we don't plan to fix any issues
with it, also for backwards compatibility.

Colored Output
..............

Cross-platform printing of colored text can then be done using Colorama's
constant shorthand for ANSI escape sequences. These are deliberately
rudimentary, see below.

.. code-block:: python

    from colorama import Fore, Back, Style
    print(Fore.RED + 'some red text')
    print(Back.GREEN + 'and with a green background')
    print(Style.DIM + 'and in dim text')
    print(Style.RESET_ALL)
    print('back to normal now')

...or simply by manually printing ANSI sequences from your own code:

.. code-block:: python

    print('\033[31m' + 'some red text')
    print('\033[39m') # and reset to default color

...or, Colorama can be used in conjunction with existing ANSI libraries
such as the venerable `Termcolor <https://pypi.org/project/termcolor/>`_
the fabulous `Blessings <https://pypi.org/project/blessings/>`_,
or the incredible `_Rich <https://pypi.org/project/rich/>`_.

If you wish Colorama's Fore, Back and Style constants were more capable,
then consider using one of the above highly capable libraries to generate
colors, etc, and use Colorama just for its primary purpose: to convert
those ANSI sequences to also work on Windows:

SIMILARLY, do not send PRs adding the generation of new ANSI types to Colorama.
We are only interested in converting ANSI codes to win32 API calls, not
shortcuts like the above to generate ANSI characters.

.. code-block:: python

    from colorama import just_fix_windows_console
    from termcolor import colored

    # use Colorama to make Termcolor work on Windows too
    just_fix_windows_console()

    # then use Termcolor for all colored text output
    print(colored('Hello, World!', 'green', 'on_red'))

Available formatting constants are::

    Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
    Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
    Style: DIM, NORMAL, BRIGHT, RESET_ALL

``Style.RESET_ALL`` resets foreground, background, and brightness. Colorama will
perform this reset automatically on program exit.

These are fairly well supported, but not part of the standard::

    Fore: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
    Back: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX

Cursor Positioning
..................

ANSI codes to reposition the cursor are supported. See ``demos/demo06.py`` for
an example of how to generate them.

Init Keyword Args
.................

``init()`` accepts some ``**kwargs`` to override default behaviour.

init(autoreset=False):
    If you find yourself repeatedly sending reset sequences to turn off color
    changes at the end of every print, then ``init(autoreset=True)`` will
    automate that:

    .. code-block:: python

        from colorama import init
        init(autoreset=True)
        print(Fore.RED + 'some red text')
        print('automatically back to default color again')

init(strip=None):
    Pass ``True`` or ``False`` to override whether ANSI codes should be
    stripped from the output. The default behaviour is to strip if on Windows
    or if output is redirected (not a tty).

init(convert=None):
    Pass ``True`` or ``False`` to override whether to convert ANSI codes in the
    output into win32 calls. The default behaviour is to convert if on Windows
    and output is to a tty (terminal).

init(wrap=True):
    On Windows, Colorama works by replacing ``sys.stdout`` and ``sys.stderr``
    with proxy objects, which override the ``.write()`` method to do their work.
    If this wrapping causes you problems, then this can be disabled by passing
    ``init(wrap=False)``. The default behaviour is to wrap if ``autoreset`` or
    ``strip`` or ``convert`` are True.

    When wrapping is disabled, colored printing on non-Windows platforms will
    continue to work as normal. To do cross-platform colored output, you can
    use Colorama's ``AnsiToWin32`` proxy directly:

    .. code-block:: python

        import sys
        from colorama import init, AnsiToWin32
        init(wrap=False)
        stream = AnsiToWin32(sys.stderr).stream

        # Python 2
        print >>stream, Fore.BLUE + 'blue text on stderr'

        # Python 3
        print(Fore.BLUE + 'blue text on stderr', file=stream)

Recognised ANSI Sequences
.........................

ANSI sequences generally take the form::

    ESC [ <param> ; <param> ... <command>

Where ``<param>`` is an integer, and ``<command>`` is a single letter. Zero or
more params are passed to a ``<command>``. If no params are passed, it is
generally synonymous with passing a single zero. No spaces exist in the
sequence; they have been inserted here simply to read more easily.

The only ANSI sequences that Colorama converts into win32 calls are::

    ESC [ 0 m       # reset all (colors and brightness)
    ESC [ 1 m       # bright
    ESC [ 2 m       # dim (looks same as normal brightness)
    ESC [ 22 m      # normal brightness

    # FOREGROUND:
    ESC [ 30 m      # black
    ESC [ 31 m      # red
    ESC [ 32 m      # green
    ESC [ 33 m      # yellow
    ESC [ 34 m      # blue
    ESC [ 35 m      # magenta
    ESC [ 36 m      # cyan
    ESC [ 37 m      # white
    ESC [ 39 m      # reset

    # BACKGROUND
    ESC [ 40 m      # black
    ESC [ 41 m      # red
    ESC [ 42 m      # green
    ESC [ 43 m      # yellow
    ESC [ 44 m      # blue
    ESC [ 45 m      # magenta
    ESC [ 46 m      # cyan
    ESC [ 47 m      # white
    ESC [ 49 m      # reset

    # cursor positioning
    ESC [ y;x H     # position cursor at x across, y down
    ESC [ y;x f     # position cursor at x across, y down
    ESC [ n A       # move cursor n lines up
    ESC [ n B       # move cursor n lines down
    ESC [ n C       # move cursor n characters forward
    ESC [ n D       # move cursor n characters backward

    # clear the screen
    ESC [ mode J    # clear the screen

    # clear the line
    ESC [ mode K    # clear the line

Multiple numeric params to the ``'m'`` command can be combined into a single
sequence::

    ESC [ 36 ; 45 ; 1 m     # bright cyan text on magenta background

All other ANSI sequences of the form ``ESC [ <param> ; <param> ... <command>``
are silently stripped from the output on Windows.

Any other form of ANSI sequence, such as single-character codes or alternative
initial characters, are not recognised or stripped. It would be cool to add
them though. Let me know if it would be useful for you, via the Issues on
GitHub.

Status & Known Problems
-----------------------

I've personally only tested it on Windows XP (CMD, Console2), Ubuntu
(gnome-terminal, xterm), and OS X.

Some valid ANSI sequences aren't recognised.

If you're hacking on the code, see `README-hacking.md`_. ESPECIALLY, see the
explanation there of why we do not want PRs that allow Colorama to generate new
types of ANSI codes.

See outstanding issues and wish-list:
https://github.com/tartley/colorama/issues

If anything doesn't work for you, or doesn't do what you expected or hoped for,
I'd love to hear about it on that issues list, would be delighted by patches,
and would be happy to grant commit access to anyone who submits a working patch
or two.

.. _README-hacking.md: README-hacking.md

License
-------

Copyright Jonathan Hartley & Arnon Yaari, 2013-2020. BSD 3-Clause license; see
LICENSE file.

Professional support
--------------------

.. |tideliftlogo| image:: https://cdn2.hubspot.net/hubfs/4008838/website/logos/logos_for_download/Tidelift_primary-shorthand-logo.png
   :alt: Tidelift
   :target: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme

.. list-table::
   :widths: 10 100

   * - |tideliftlogo|
     - Professional support for colorama is available as part of the
       `Tidelift Subscription`_.
       Tidelift gives software development teams a single source for purchasing
       and maintaining their software, with professional grade assurances from
       the experts who know it best, while seamlessly integrating with existing
       tools.

.. _Tidelift Subscription: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme

Thanks
------

See the CHANGELOG for more thanks!

* Marc Schlaich (schlamar) for a ``setup.py`` fix for Python2.5.
* Marc Abramowitz, reported & fixed a crash on exit with closed ``stdout``,
  providing a solution to issue #7's setuptools/distutils debate,
  and other fixes.
* User 'eryksun', for guidance on correctly instantiating ``ctypes.windll``.
* Matthew McCormick for politely pointing out a longstanding crash on non-Win.
* Ben Hoyt, for a magnificent fix under 64-bit Windows.
* Jesse at Empty Square for submitting a fix for examples in the README.
* User 'jamessp', an observant documentation fix for cursor positioning.
* User 'vaal1239', Dave Mckee & Lackner Kristof for a tiny but much-needed Win7
  fix.
* Julien Stuyck, for wisely suggesting Python3 compatible updates to README.
* Daniel Griffith for multiple fabulous patches.
* Oscar Lesta for a valuable fix to stop ANSI chars being sent to non-tty
  output.
* Roger Binns, for many suggestions, valuable feedback, & bug reports.
* Tim Golden for thought and much appreciated feedback on the initial idea.
* User 'Zearin' for updates to the README file.
* John Szakmeister for adding support for light colors
* Charles Merriam for adding documentation to demos
* Jurko for a fix on 64-bit Windows CPython2.5 w/o ctypes
* Florian Bruhin for a fix when stdout or stderr are None
* Thomas Weininger for fixing ValueError on Windows
* Remi Rampin for better Github integration and fixes to the README file
* Simeon Visser for closing a file handle using 'with' and updating classifiers
  to include Python 3.3 and 3.4
* Andy Neff for fixing RESET of LIGHT_EX colors.
* Jonathan Hartley for the initial idea and implementation.
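
A compact sketch of the API this METADATA documents, combining the two documented entry points (it assumes the bundled colorama 0.4.6 is importable; it is not part of the wheel contents):

# Enable ANSI handling on Windows consoles (a no-op elsewhere), then print
# with the constant shorthand described above.
from colorama import Back, Fore, Style, just_fix_windows_console

just_fix_windows_console()

print(Fore.CYAN + Back.MAGENTA + Style.BRIGHT + "bright cyan on a magenta background")
print(Style.RESET_ALL + "back to the terminal defaults")
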
env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD
ADDED
@@ -0,0 +1,31 @@
colorama-0.4.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
colorama-0.4.6.dist-info/METADATA,sha256=e67SnrUMOym9sz_4TjF3vxvAV4T3aF7NyqRHHH3YEMw,17158
colorama-0.4.6.dist-info/RECORD,,
colorama-0.4.6.dist-info/WHEEL,sha256=cdcF4Fbd0FPtw2EMIOwH-3rSOTUdTCeOSXRMD1iLUb8,105
colorama-0.4.6.dist-info/licenses/LICENSE.txt,sha256=ysNcAmhuXQSlpxQL-zs25zrtSWZW6JEQLkKIhteTAxg,1491
colorama/__init__.py,sha256=wePQA4U20tKgYARySLEC047ucNX-g8pRLpYBuiHlLb8,266
colorama/__pycache__/__init__.cpython-312.pyc,,
colorama/__pycache__/ansi.cpython-312.pyc,,
colorama/__pycache__/ansitowin32.cpython-312.pyc,,
colorama/__pycache__/initialise.cpython-312.pyc,,
colorama/__pycache__/win32.cpython-312.pyc,,
colorama/__pycache__/winterm.cpython-312.pyc,,
colorama/ansi.py,sha256=Top4EeEuaQdBWdteKMEcGOTeKeF19Q-Wo_6_Cj5kOzQ,2522
colorama/ansitowin32.py,sha256=vPNYa3OZbxjbuFyaVo0Tmhmy1FZ1lKMWCnT7odXpItk,11128
colorama/initialise.py,sha256=-hIny86ClXo39ixh5iSCfUIa2f_h_bgKRDW7gqs-KLU,3325
colorama/tests/__init__.py,sha256=MkgPAEzGQd-Rq0w0PZXSX2LadRWhUECcisJY8lSrm4Q,75
colorama/tests/__pycache__/__init__.cpython-312.pyc,,
colorama/tests/__pycache__/ansi_test.cpython-312.pyc,,
colorama/tests/__pycache__/ansitowin32_test.cpython-312.pyc,,
colorama/tests/__pycache__/initialise_test.cpython-312.pyc,,
colorama/tests/__pycache__/isatty_test.cpython-312.pyc,,
colorama/tests/__pycache__/utils.cpython-312.pyc,,
colorama/tests/__pycache__/winterm_test.cpython-312.pyc,,
colorama/tests/ansi_test.py,sha256=FeViDrUINIZcr505PAxvU4AjXz1asEiALs9GXMhwRaE,2839
colorama/tests/ansitowin32_test.py,sha256=RN7AIhMJ5EqDsYaCjVo-o4u8JzDD4ukJbmevWKS70rY,10678
colorama/tests/initialise_test.py,sha256=BbPy-XfyHwJ6zKozuQOvNvQZzsx9vdb_0bYXn7hsBTc,6741
colorama/tests/isatty_test.py,sha256=Pg26LRpv0yQDB5Ac-sxgVXG7hsA1NYvapFgApZfYzZg,1866
colorama/tests/utils.py,sha256=1IIRylG39z5-dzq09R_ngufxyPZxgldNbrxKxUGwGKE,1079
colorama/tests/winterm_test.py,sha256=qoWFPEjym5gm2RuMwpf3pOis3a5r_PJZFCzK254JL8A,3709
colorama/win32.py,sha256=YQOKwMTwtGBbsY4dL5HYTvwTeP9wIQra5MvPNddpxZs,6181
colorama/winterm.py,sha256=XCQFDHjPi6AHYNdZwy0tA02H-Jh48Jp-HvCjeLeLp3U,7134
env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: hatchling 1.11.1
Root-Is-Purelib: true
Tag: py2-none-any
Tag: py3-none-any
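
These dist-info files (METADATA, RECORD, WHEEL, INSTALLER) are what the standard library reads at runtime; a hedged sketch of inspecting them, assuming this bundled site-packages directory is on sys.path:

# Inspect the installed colorama distribution through importlib.metadata,
# which parses the METADATA and RECORD files shown above.
from importlib.metadata import distribution

dist = distribution("colorama")
print(dist.version)              # "0.4.6", taken from METADATA
print(dist.metadata["Summary"])  # "Cross-platform colored terminal text."
print(len(dist.files or []))     # number of entries recorded in RECORD
print(dist.read_text("WHEEL"))   # the WHEEL contents shown above
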