rudra0410hf committed
Commit 1ccf66a · verified · 1 parent: f518ebf

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.

Files changed (50):
  1. .dockerignore +4 -0
  2. .gitattributes +20 -35
  3. .gitignore +46 -0
  4. Dockerfile +17 -0
  5. app.py +112 -0
  6. brain_ai.py +113 -0
  7. code.ipynb +1758 -0
  8. demo.html +66 -0
  9. env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER +1 -0
  10. env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE +20 -0
  11. env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA +46 -0
  12. env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD +43 -0
  13. env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL +5 -0
  14. env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt +2 -0
  15. env/Lib/site-packages/_yaml/__init__.py +33 -0
  16. env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER +1 -0
  17. env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE +20 -0
  18. env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA +77 -0
  19. env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD +14 -0
  20. env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL +5 -0
  21. env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt +1 -0
  22. env/Lib/site-packages/certifi/__init__.py +4 -0
  23. env/Lib/site-packages/certifi/__main__.py +12 -0
  24. env/Lib/site-packages/certifi/cacert.pem +0 -0
  25. env/Lib/site-packages/certifi/core.py +114 -0
  26. env/Lib/site-packages/certifi/py.typed +0 -0
  27. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER +1 -0
  28. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE +21 -0
  29. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA +721 -0
  30. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD +35 -0
  31. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL +5 -0
  32. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt +2 -0
  33. env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt +1 -0
  34. env/Lib/site-packages/charset_normalizer/__init__.py +48 -0
  35. env/Lib/site-packages/charset_normalizer/__main__.py +6 -0
  36. env/Lib/site-packages/charset_normalizer/api.py +668 -0
  37. env/Lib/site-packages/charset_normalizer/cd.py +395 -0
  38. env/Lib/site-packages/charset_normalizer/cli/__init__.py +8 -0
  39. env/Lib/site-packages/charset_normalizer/cli/__main__.py +321 -0
  40. env/Lib/site-packages/charset_normalizer/constant.py +1998 -0
  41. env/Lib/site-packages/charset_normalizer/legacy.py +66 -0
  42. env/Lib/site-packages/charset_normalizer/md.py +630 -0
  43. env/Lib/site-packages/charset_normalizer/models.py +360 -0
  44. env/Lib/site-packages/charset_normalizer/py.typed +0 -0
  45. env/Lib/site-packages/charset_normalizer/utils.py +408 -0
  46. env/Lib/site-packages/charset_normalizer/version.py +8 -0
  47. env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER +1 -0
  48. env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA +441 -0
  49. env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD +31 -0
  50. env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL +5 -0
.dockerignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__
+ .git
+ .vscode
+ venv
.gitattributes CHANGED
@@ -1,35 +1,20 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Auto detect text files and perform LF normalization
+ * text=auto
+ env/Lib/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text
+ env/Lib/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text
+ env/Lib/site-packages/pip/_vendor/distlib/w64-arm.exe filter=lfs diff=lfs merge=lfs -text
+ env/Lib/site-packages/pip/_vendor/distlib/w64.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/dotenv.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/huggingface-cli.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/normalizer.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pip.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pip3.12.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pip3.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
+ env/Scripts/tqdm.exe filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_1_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_2_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_3_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_4_batch_0.png filter=lfs diff=lfs merge=lfs -text
+ static/progress/epoch_5_batch_0.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Virtual environment
+ venv/
+ .env
+ *.env
+
+ # Logs & Debugging
+ *.log
+ logs/
+ debug.log
+
+ # System Files
+ .DS_Store
+ Thumbs.db
+
+ # Python dependencies
+ Pipfile
+ Pipfile.lock
+ requirements.txt
+
+ # IDE & Editor Specific
+ .vscode/
+ .idea/
+ *.iml
+
+ # Compiled Python packages
+ *.egg
+ *.egg-info/
+ dist/
+ build/
+
+ # Hugging Face Cache
+ .huggingface/
+
+ # Node.js & Frontend files
+ node_modules/
+ npm-debug.log
+ yarn-error.log
+
+ # Flask Specific
+ instance/
+ config.py
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9
+
+ # Set the working directory in the container
+ WORKDIR /BrainAI
+
+ # Copy the current directory contents into the container
+ COPY . /BrainAI
+
+ # Install any needed dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Expose the application port (default Flask port)
+ EXPOSE 5000
+
+ # Run the application
+ CMD ["python", "app.py"]
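
As a quick sanity check, here is a minimal sketch that probes the containerized Flask app once the image above has been built and started with the container's port 5000 published locally. The base URL, the timeout, and the assumption that GET / returns the rendered index page are illustrative, not part of the commit:

import requests  # third-party HTTP client, assumed to be installed on the host

# Assumed local setup: the container from this Dockerfile is running with
# port 5000 published on the host, e.g. reachable at http://localhost:5000.
BASE_URL = "http://localhost:5000"

def container_is_up() -> bool:
    """Return True if the Flask app inside the container answers GET /."""
    try:
        response = requests.get(f"{BASE_URL}/", timeout=5)
        return response.ok
    except requests.RequestException as exc:
        print(f"Container not reachable: {exc}")
        return False

if __name__ == "__main__":
    print("container up" if container_is_up() else "container not reachable")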
app.py ADDED
@@ -0,0 +1,112 @@
+ import os
+ import cv2
+ import numpy as np
+ from flask import Flask, render_template, request, redirect, url_for, jsonify
+ from tensorflow.keras.layers import Layer
+ from tensorflow.keras.models import load_model
+ import matplotlib.pyplot as plt
+ import tensorflow as tf
+
+ app = Flask(__name__)
+ app.config['UPLOAD_FOLDER'] = 'static/uploads'
+ app.config['ALLOWED_EXTENSIONS'] = {'png', 'jpg', 'jpeg'}
+
+ class Sampling(tf.keras.layers.Layer):
+     def call(self, inputs):
+         z_mean, z_log_var = inputs
+         batch = tf.shape(z_mean)[0]
+         dim = tf.shape(z_mean)[1]
+         epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
+         return z_mean + tf.exp(0.5 * z_log_var) * epsilon
+
+ # Load models with explicit TensorFlow context
+ with tf.init_scope():
+     ct_to_mri_model = load_model('models/ct_to_mri_epoch_39.h5',
+                                  custom_objects={'Sampling': Sampling})
+     mri_to_ct_model = load_model('models/mri_to_ct_epoch_39.h5',
+                                  custom_objects={'Sampling': Sampling})
+
+ def allowed_file(filename):
+     return '.' in filename and \
+         filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
+
+ def process_image(image_path, model):
+     img = cv2.imread(image_path)
+     if img is None:
+         raise ValueError("Could not load image")
+
+     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     img = cv2.resize(img, (256, 256))
+     img = img.astype(np.float32) / 255.0
+     img = np.expand_dims(img, axis=0)
+
+     prediction = model.predict(img)
+     if isinstance(prediction, (list, tuple)):
+         prediction = prediction[0]
+
+     prediction = np.squeeze(prediction)
+     prediction = (prediction * 255).astype(np.uint8)
+     return prediction
+
+ def clean_uploads():
+     upload_dir = app.config['UPLOAD_FOLDER']
+     for filename in os.listdir(upload_dir):
+         file_path = os.path.join(upload_dir, filename)
+         try:
+             if os.path.isfile(file_path):
+                 os.unlink(file_path)
+         except Exception as e:
+             print(f'Error deleting {file_path}: {e}')
+
+ @app.route('/')
+ def index():
+     return render_template('index.html')
+
+ @app.route('/try_now')
+ def try_now():
+     return render_template('try_now.html')
+
+ @app.route('/samples')
+ def samples():
+     return render_template('samples.html')
+
+ @app.route('/model_info')
+ def model_info():
+     return render_template('model_info.html')
+
+ @app.route('/translate', methods=['POST'])
+ def translate():
+     clean_uploads()  # Clean previous uploads
+
+     if 'file' not in request.files:
+         return redirect(request.url)
+
+     file = request.files['file']
+     if file.filename == '':
+         return redirect(request.url)
+
+     if file and allowed_file(file.filename):
+         # Save original image
+         upload_path = os.path.join(app.config['UPLOAD_FOLDER'], 'original.png')
+         file.save(upload_path)
+
+         # Choose model based on translation direction
+         direction = request.form.get('direction')
+         model = ct_to_mri_model if direction == 'ct_to_mri' else mri_to_ct_model
+
+         try:
+             result = process_image(upload_path, model)
+             result_path = os.path.join(app.config['UPLOAD_FOLDER'], 'result.png')
+             plt.imsave(result_path, result)
+
+             return render_template('result.html',
+                                    original=url_for('static', filename='uploads/original.png'),
+                                    result=url_for('static', filename='uploads/result.png'))
+         except Exception as e:
+             return f"Error processing image: {str(e)}"
+
+     return redirect(url_for('try_now'))
+
+ if __name__ == '__main__':
+     os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+     app.run(host='0.0.0.0', port=5000, debug=False)  # Set debug=False for production
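
For reference, a minimal client sketch for the /translate endpoint defined above, assuming the app is running locally on port 5000 (as in app.run() above); the input file name is hypothetical:

import requests  # third-party HTTP client used only for this illustration

URL = "http://localhost:5000/translate"   # host/port from app.run() above
INPUT_PATH = "sample_ct.png"              # hypothetical local CT slice

with open(INPUT_PATH, "rb") as fh:
    response = requests.post(
        URL,
        files={"file": fh},               # form field checked by translate()
        data={"direction": "ct_to_mri"},  # or "mri_to_ct" for the reverse model
        timeout=120,                      # inference can be slow on CPU
    )

# On success the endpoint renders result.html, so the response body is HTML;
# the translated image itself is written to static/uploads/result.png on the server.
print(response.status_code)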
brain_ai.py ADDED
@@ -0,0 +1,113 @@
+ import streamlit as st
+ import time
+ from langchain_ollama import ChatOllama
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import (
+     SystemMessagePromptTemplate,
+     HumanMessagePromptTemplate,
+     AIMessagePromptTemplate,
+     ChatPromptTemplate
+ )
+
+ st.title("🧠 BrainAI")
+ st.caption("🚀 Your own AI Neurologist with SuperPowers!!")
+
+ # Common user query suggestions
+ suggestions = [
+     "What are the early symptoms of a brain tumor?",
+     "How is a brain tumor diagnosed?",
+     "What are the treatment options for brain tumors?",
+     "Can a brain tumor be non-cancerous?",
+     "What lifestyle changes can help manage brain tumors?"
+ ]
+
+ # Display suggestions in rows and keep them fixed at the top
+ # st.write("### 💡 Common Questions")
+ suggestion_container = st.container()
+ with suggestion_container:
+     for query in suggestions:
+         if st.button(query, key=query):
+             st.session_state["user_input"] = query
+             st.rerun()
+
+ # Initiate chat engine
+ llm_engine = ChatOllama(
+     model="deepseek-r1:1.5b",
+     base_url="http://localhost:11434",
+     temperature=0.3
+ )
+
+ # System prompt
+ system_prompt = SystemMessagePromptTemplate.from_template("""
+ You are BrainAI, an AI-powered neurologist assistant designed to provide non-emergency guidance, education,
+ and support for neurological health. Your expertise includes brain anatomy, neurological disorders (e.g.,
+ epilepsy, Alzheimer’s, brain tumors, migraines), symptoms, diagnostics, and general brain health tips.
+ Always prioritize ethical guidelines, clarify your limitations, and emphasize consulting a licensed professional
+ for personal care. Answer only in English language.
+ """)
+
+ # Session management
+ if "message_log" not in st.session_state:
+     st.session_state.message_log = [{"role": "assistant", "content": "Hello! How can I assist you with brain health today?"}]
+
+ # Chat container
+ chat_container = st.container()
+
+ # Display messages with animation
+ def display_text_with_animation(text):
+     message_placeholder = st.empty()
+     displayed_text = ""
+     for char in text:
+         displayed_text += char
+         message_placeholder.markdown(displayed_text)
+         time.sleep(0.01)
+
+ with chat_container:
+     for message in st.session_state.message_log:
+         with st.chat_message(message["role"]):
+             if "<think>" in message["content"]:
+                 parts = message["content"].split("</think>")
+                 think_content = parts[0].replace("<think>", "").strip()
+                 actual_response = parts[-1].strip()
+
+                 with st.expander("🔍 View AI's Thinking Process"):
+                     st.markdown(f"*Internal Analysis:*\n{think_content}")
+
+                 display_text_with_animation(actual_response)
+             else:
+                 display_text_with_animation(message["content"])
+
+ # Chat input
+ user_query = st.chat_input(" Ask anything about brain health ...")
+
+ # If a suggestion was selected, use it as the input
+ if "user_input" in st.session_state:
+     user_query = st.session_state["user_input"]
+     del st.session_state["user_input"]
+
+ def generate_ai_response(prompt_chain):
+     processing_pipeline = prompt_chain | llm_engine | StrOutputParser()
+     return processing_pipeline.invoke({})
+
+ def build_prompt_chain():
+     prompt_sequence = [system_prompt]
+     for msg in st.session_state.message_log:
+         if msg["role"] == "user":
+             prompt_sequence.append(HumanMessagePromptTemplate.from_template(msg["content"]))
+         elif msg["role"] == "assistant":
+             prompt_sequence.append(AIMessagePromptTemplate.from_template(msg["content"]))
+     return ChatPromptTemplate.from_messages(prompt_sequence)
+
+ if user_query:
+     st.session_state.message_log.append({"role": "user", "content": user_query})
+
+     with st.spinner("🧠 Thinking ..."):
+         prompt_chain = build_prompt_chain()
+         raw_response = generate_ai_response(prompt_chain)
+
+     st.session_state.message_log.append({
+         "role": "assistant",
+         "content": raw_response
+     })
+
+     st.rerun()
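
The chat logic above can be exercised outside Streamlit with a condensed version of the same prompt | model | parser chain. This sketch assumes the same local Ollama server at http://localhost:11434 with the deepseek-r1:1.5b model already pulled, and it shortens the system prompt purely for illustration:

from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

# Same engine settings as brain_ai.py; requires a running Ollama instance.
llm_engine = ChatOllama(
    model="deepseek-r1:1.5b",
    base_url="http://localhost:11434",
    temperature=0.3,
)

# Shortened system prompt for illustration; brain_ai.py uses a longer one.
system_prompt = SystemMessagePromptTemplate.from_template(
    "You are BrainAI, an AI-powered neurologist assistant. Answer only in English."
)
user_prompt = HumanMessagePromptTemplate.from_template(
    "What are the early symptoms of a brain tumor?"
)

# prompt | model | parser mirrors generate_ai_response() above.
chain = ChatPromptTemplate.from_messages([system_prompt, user_prompt]) | llm_engine | StrOutputParser()
print(chain.invoke({}))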
code.ipynb ADDED
@@ -0,0 +1,1758 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "*#Image to Image Translation#*"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Requirement already satisfied: tensorflow==2.15.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (2.15.0)\n",
20
+ "Requirement already satisfied: tensorflow-intel==2.15.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow==2.15.0) (2.15.0)\n",
21
+ "Requirement already satisfied: grpcio<2.0,>=1.24.3 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (1.70.0)\n",
22
+ "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.31.0)\n",
23
+ "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (1.26.4)\n",
24
+ "Requirement already satisfied: packaging in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (24.2)\n",
25
+ "Requirement already satisfied: tensorflow-estimator<2.16,>=2.15.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.15.0)\n",
26
+ "Requirement already satisfied: libclang>=13.0.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (18.1.1)\n",
27
+ "Requirement already satisfied: six>=1.12.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (1.17.0)\n",
28
+ "Requirement already satisfied: absl-py>=1.0.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.1.0)\n",
29
+ "Requirement already satisfied: setuptools in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (65.5.0)\n",
30
+ "Requirement already satisfied: h5py>=2.9.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.12.1)\n",
31
+ "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.6.0)\n",
32
+ "Requirement already satisfied: flatbuffers>=23.5.26 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (25.1.24)\n",
33
+ "Requirement already satisfied: google-pasta>=0.1.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.2.0)\n",
34
+ "Requirement already satisfied: termcolor>=1.1.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.5.0)\n",
35
+ "Requirement already satisfied: astunparse>=1.6.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (1.6.3)\n",
36
+ "Requirement already satisfied: typing-extensions>=3.6.6 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (4.12.2)\n",
37
+ "Requirement already satisfied: tensorboard<2.16,>=2.15 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.15.2)\n",
38
+ "Requirement already satisfied: keras<2.16,>=2.15.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.15.0)\n",
39
+ "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.20.3)\n",
40
+ "Requirement already satisfied: ml-dtypes~=0.2.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.2.0)\n",
41
+ "Requirement already satisfied: wrapt<1.15,>=1.11.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (1.14.1)\n",
42
+ "Requirement already satisfied: opt-einsum>=2.3.2 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.4.0)\n",
43
+ "Requirement already satisfied: wheel<1.0,>=0.23.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from astunparse>=1.6.0->tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.45.1)\n",
44
+ "Requirement already satisfied: requests<3,>=2.21.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.32.3)\n",
45
+ "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.7.2)\n",
46
+ "Requirement already satisfied: werkzeug>=1.0.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.1.3)\n",
47
+ "Requirement already satisfied: markdown>=2.6.8 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.7)\n",
48
+ "Requirement already satisfied: google-auth<3,>=1.6.3 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.38.0)\n",
49
+ "Requirement already satisfied: google-auth-oauthlib<2,>=0.5 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (1.2.1)\n",
50
+ "Requirement already satisfied: cachetools<6.0,>=2.0.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (5.5.1)\n",
51
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.4.1)\n",
52
+ "Requirement already satisfied: rsa<5,>=3.1.4 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (4.9)\n",
53
+ "Requirement already satisfied: requests-oauthlib>=0.7.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.0.0)\n",
54
+ "Requirement already satisfied: charset-normalizer<4,>=2 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.4.1)\n",
55
+ "Requirement already satisfied: idna<4,>=2.5 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.10)\n",
56
+ "Requirement already satisfied: certifi>=2017.4.17 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (2025.1.31)\n",
57
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from requests<3,>=2.21.0->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (2.3.0)\n",
58
+ "Requirement already satisfied: MarkupSafe>=2.1.1 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from werkzeug>=1.0.1->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.0.2)\n",
59
+ "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (0.6.1)\n",
60
+ "Requirement already satisfied: oauthlib>=3.0.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<2,>=0.5->tensorboard<2.16,>=2.15->tensorflow-intel==2.15.0->tensorflow==2.15.0) (3.2.2)\n"
61
+ ]
62
+ },
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "\n",
68
+ "[notice] A new release of pip is available: 23.0.1 -> 25.0.1\n",
69
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "!pip install tensorflow==2.15.0\n"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 2,
80
+ "metadata": {},
81
+ "outputs": [
82
+ {
83
+ "name": "stdout",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "Requirement already satisfied: tensorflow-probability==0.23.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (0.23.0)\n",
87
+ "Requirement already satisfied: six>=1.10.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (1.17.0)\n",
88
+ "Requirement already satisfied: decorator in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (5.1.1)\n",
89
+ "Requirement already satisfied: cloudpickle>=1.3 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (3.1.1)\n",
90
+ "Requirement already satisfied: gast>=0.3.2 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (0.6.0)\n",
91
+ "Requirement already satisfied: numpy>=1.13.3 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (1.26.4)\n",
92
+ "Requirement already satisfied: absl-py in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (2.1.0)\n",
93
+ "Requirement already satisfied: dm-tree in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from tensorflow-probability==0.23.0) (0.1.9)\n",
94
+ "Requirement already satisfied: attrs>=18.2.0 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from dm-tree->tensorflow-probability==0.23.0) (25.1.0)\n",
95
+ "Requirement already satisfied: wrapt>=1.11.2 in d:\\vs code\\web dev\\projects\\image2image\\image\\lib\\site-packages (from dm-tree->tensorflow-probability==0.23.0) (1.14.1)\n"
96
+ ]
97
+ },
98
+ {
99
+ "name": "stderr",
100
+ "output_type": "stream",
101
+ "text": [
102
+ "\n",
103
+ "[notice] A new release of pip is available: 23.0.1 -> 25.0.1\n",
104
+ "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
105
+ ]
106
+ }
107
+ ],
108
+ "source": [
109
+ "!pip install tensorflow-probability==0.23.0"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "markdown",
114
+ "metadata": {},
115
+ "source": [
116
+ "*1️⃣ Import Necessary Libraries*\n",
117
+ "\n",
118
+ "1.*TensorFlow/Keras* for building and training deep learning models.\n",
119
+ "\n",
120
+ "2.*NumPy* for numerical operations/n.\n",
121
+ "\n",
122
+ "3.*Matplotlib* for visualizing the results.\n",
123
+ "\n",
124
+ "4.*OpenCV/PIL* for image processing.\n",
125
+ "\n",
126
+ "5.*TensorFlow* Addons for additional loss functions and layers (e.g., InstanceNorm).\n",
127
+ "\n",
128
+ "6.*TensorFlow Datasets* (or custom loaders) to load CT & MRI images."
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 3,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "name": "stdout",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
141
+ "\n",
142
+ "WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\tensorflow_probability\\python\\internal\\backend\\numpy\\_utils.py:48: The name tf.logging.TaskLevelStatusMessage is deprecated. Please use tf.compat.v1.logging.TaskLevelStatusMessage instead.\n",
143
+ "\n",
144
+ "WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\tensorflow_probability\\python\\internal\\backend\\numpy\\_utils.py:48: The name tf.control_flow_v2_enabled is deprecated. Please use tf.compat.v1.control_flow_v2_enabled instead.\n",
145
+ "\n"
146
+ ]
147
+ }
148
+ ],
149
+ "source": [
150
+ "import tensorflow as tf\n",
151
+ "from tensorflow.keras import layers, Model\n",
152
+ "import numpy as np\n",
153
+ "import cv2\n",
154
+ "import pathlib\n",
155
+ "import matplotlib.pyplot as plt\n",
156
+ "import tensorflow_probability as tfp\n",
157
+ "\n",
158
+ "tfd = tfp.distributions"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "markdown",
163
+ "metadata": {},
164
+ "source": [
165
+ "2️⃣ *Configuration (Hyperparameters)*\n",
166
+ "\n",
167
+ "This step defines the key settings for training.\n",
168
+ "\n",
169
+ "\n",
170
+ "Image size: The input image dimensions.\n",
171
+ "\n",
172
+ "Latent dimension: The size of the encoded representation in the VAE.\n",
173
+ "\n",
174
+ "Learning rate: Defines how fast the model updates weights.\n",
175
+ "\n",
176
+ "Batch size & epochs: Training parameters."
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 4,
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "IMAGE_SHAPE = (256, 256, 3)\n",
186
+ "LATENT_DIM = 256\n",
187
+ "FILTERS = 16\n",
188
+ "KERNEL = 3\n",
189
+ "LEARNING_RATE = 0.0001\n",
190
+ "WEIGHT_DECAY = 6e-8\n",
191
+ "BATCH_SIZE = 1\n",
192
+ "EPOCHS = 10"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "markdown",
197
+ "metadata": {},
198
+ "source": [
199
+ "* ===================== Architecture Components =====================*"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "markdown",
204
+ "metadata": {},
205
+ "source": [
206
+ "*3️⃣ Sampling Layer for Variational Autoencoder (VAE)*\n",
207
+ "\n",
208
+ "The sampling layer is a crucial part of the VAE, where we sample from the latent space.\n",
209
+ "\n",
210
+ "🔹 What We Need \n",
211
+ "\n",
212
+ "The encoder outputs μ (mean) and σ (log variance).\n",
213
+ "\n",
214
+ "This layer samples from a normal distribution using the reparameterization trick."
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "class Sampling(layers.Layer):\n",
224
+ " def call(self, inputs):\n",
225
+ " z_mean, z_log_var = inputs\n",
226
+ " batch = tf.shape(z_mean)[0]\n",
227
+ " dim = tf.shape(z_mean)[1]\n",
228
+ " epsilon = tf.random.normal(shape=(batch, dim))\n",
229
+ " return z_mean + tf.exp(0.5 * z_log_var) * epsilon"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "metadata": {},
235
+ "source": [
236
+ "It inherits from layers.Layer, meaning it's a custom layer that can be used like any other Keras layer.\n",
237
+ "\n",
238
+ "📍inputs is a tuple containing:\n",
239
+ "\n",
240
+ " z_mean: The mean vector output from the encoder.\n",
241
+ "\n",
242
+ " z_log_var: The log variance vector output from the encoder.\n",
243
+ "\n",
244
+ "📍This unpacks the inputs into two separate variables:\n",
245
+ "\n",
246
+ " z_mean: Represents the mean of the latent distribution.\n",
247
+ "\n",
248
+ " z_log_var: Represents the log variance of the latent distribution.\n",
249
+ "\n",
250
+ "📍Why log variance?\n",
251
+ "\n",
252
+ "Instead of using variance (σ²), we use log(σ²) because:\n",
253
+ "\n",
254
+ "Numerical Stability: Log prevents exploding/vanishing gradients.\n",
255
+ "\n",
256
+ "Easier Optimization: exp(log(σ²) / 2) makes variance always positive.\n",
257
+ "\n",
258
+ "\n",
259
+ "📍This determines:\n",
260
+ " batch: The number of samples in the batch.\n",
261
+ " dim: The size of the latent space (e.g., 128 if LATENT_DIM = 128).\n",
262
+ "\n",
263
+ "\n",
264
+ "📍Generates random values from a standard normal distribution (𝒩(0,1)).\n",
265
+ "\n",
266
+ "epsilon.shape = (batch, dim), meaning every sample gets a unique noise vector.\n",
267
+ "\n",
268
+ "Why do we need epsilon?\n",
269
+ "\n",
270
+ "Instead of directly using z_mean, we add controlled randomness to ensure the VAE learns a smooth latent space.\n",
271
+ "\n",
272
+ "* Reparameterization Trick*\n",
273
+ "\n",
274
+ " The latent space follows a normal distribution:\n",
275
+ "\n",
276
+ " 𝑧 ∼ 𝒩(μ, σ²)\n",
277
+ "\n",
278
+ " A sample is drawn from this distribution:\n",
279
+ "\n",
280
+ " 𝑧 = μ + σ * ε, where ε ∼ 𝒩(0,1).\n",
281
+ "\n",
282
+ " Since z_log_var = log(σ²), we compute:\n",
283
+ "\n",
284
+ " σ = exp(0.5 * log(σ²)) = exp(0.5 * z_log_var).\n",
285
+ "\n",
286
+ "\n",
287
+ "\n"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "markdown",
292
+ "metadata": {},
293
+ "source": [
294
+ "*🔹 Residual Block in Detail*\n",
295
+ "\n",
296
+ "This function defines a residual block, a key building block inspired by ResNet (Residual Networks). Residual blocks help in training deep neural networks efficiently by allowing gradient flow through skip connections.\n",
297
+ "\n",
298
+ "inputs: The input tensor (features from the previous layer).\n",
299
+ "\n",
300
+ "filters: The number of filters (channels) in the convolution layers.\n",
301
+ "\n",
302
+ "use_norm: Whether to apply Group Normalization (helps stabilize training)\n",
303
+ "\n",
304
+ "Step 1️⃣: First Convolution + Activation :\n",
305
+ "\n",
306
+ " Applies a 2D Convolution (Conv2D) with filters filters.\n",
307
+ "\n",
308
+ " KERNEL (not defined in this function) should be the kernel size (e.g., 3x3 or 5x5).\n",
309
+ "\n",
310
+ " padding='same': Ensures the output size is the same as the input.\n",
311
+ "\n",
312
+ " Leaky ReLU activation (alpha=0.2):\n",
313
+ "\n",
314
+ " Helps avoid dead neurons (better than regular ReLU).\n",
315
+ " \n",
316
+ " Allows a small gradient flow for negative values.\n",
317
+ "\n",
318
+ "Step 2️⃣: Group Normalization (Optional)\n",
319
+ "\n",
320
+ "Step 3️⃣: Second Convolution + Activation\n",
321
+ "\n",
322
+ " Applies another Conv2D layer with the same number of filters.\n",
323
+ "\n",
324
+ " Uses LeakyReLU again for better gradient flow.\n",
325
+ "\n",
326
+ " Why two convolutions?\n",
327
+ "\n",
328
+ " The first convolution learns low-level features.\n",
329
+ " \n",
330
+ " The second convolution refines the learned features.\n",
331
+ "\n",
332
+ "\n",
333
+ "Step 5️⃣: Shortcut Connection (Skip Connection)\n",
334
+ "\n",
335
+ " The original input is passed through a 1x1 convolution.\n",
336
+ "\n",
337
+ " This matches the number of filters with the residual output.\n",
338
+ "\n",
339
+ " Why 1x1 convolution?\n",
340
+ "\n",
341
+ " Ensures the shortcut has the same number of filters as x.\n",
342
+ "\n",
343
+ " Helps in adjusting dimensions when the number of channels changes.\n",
344
+ "\n",
345
+ "Step 6️⃣: Merge Shortcut & Residual Path\n",
346
+ "\n",
347
+ " Merges the shortcut and residual path using element-wise maximum.\n",
348
+ " \n",
349
+ " Why maximum() instead of addition (+)?\n",
350
+ "\n",
351
+ " Prevents negative values, which can help improve training stability.\n",
352
+ " \n",
353
+ " Focuses on stronger features from either the residual or shortcut path.\n",
354
+ "\n",
355
+ "\n",
356
+ "\n",
357
+ "\n",
358
+ "\n",
359
+ "\n",
360
+ "\n",
361
+ "\n"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 5,
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "def residual_block(inputs, filters, use_norm=True):\n",
371
+ " x = layers.Conv2D(filters, KERNEL, padding='same')(inputs)\n",
372
+ " x = layers.LeakyReLU(alpha=0.2)(x)\n",
373
+ " if use_norm:\n",
374
+ " x = layers.GroupNormalization(groups=1)(x)\n",
375
+ " x = layers.Conv2D(filters, KERNEL, padding='same')(x)\n",
376
+ " x = layers.LeakyReLU(alpha=0.2)(x)\n",
377
+ " if use_norm:\n",
378
+ " x = layers.GroupNormalization(groups=1)(x)\n",
379
+ " shortcut = layers.Conv2D(filters, 1, padding='same')(inputs)\n",
380
+ " return layers.maximum([x, shortcut])"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "markdown",
385
+ "metadata": {},
386
+ "source": [
387
+ "* 1️⃣ Encoder and Decoder Block*\n",
388
+ "\n",
389
+ "*Encoder*\n",
390
+ "\n",
391
+ "1️⃣ Pass Input Through Residual Block\n",
392
+ "\n",
393
+ "Uses a residual block (previously defined).\n",
394
+ "\n",
395
+ "Extracts important features while keeping the original information.\n",
396
+ "\n",
397
+ "Helps prevent vanishing gradients and allows deep networks to train effectively.\n",
398
+ "\n",
399
+ "2️⃣ Store the Skip Connection\n",
400
+ "\n",
401
+ "The skip connection stores the output of the residual block.\n",
402
+ "\n",
403
+ "It will be used later in the decoder to restore lost details.\n",
404
+ "\n",
405
+ "3️⃣ Downsampling (Reduce Spatial Size)\n",
406
+ "\n",
407
+ "Applies Max Pooling to reduce the spatial size (height & width).\n",
408
+ "\n",
409
+ "Why?\n",
410
+ "\n",
411
+ "Reduces computation.\n",
412
+ "\n",
413
+ "Forces the model to learn high-level features instead of pixel details.\n",
414
+ "\n",
415
+ "4️⃣ Return Downsampled Output & Skip Connection\n",
416
+ "\n",
417
+ "Outputs:\n",
418
+ "\n",
419
+ "x: The downsampled feature map.\n",
420
+ "\n",
421
+ "skip: The saved feature map (used later in the decoder).\n",
422
+ "\n",
423
+ "\n",
424
+ "🔥 2️⃣ Decoder Block\n",
425
+ "\n",
426
+ "1️⃣ Upsampling (Increase Spatial Size):\n",
427
+ "\n",
428
+ "Uses Conv2DTranspose (transposed convolution, aka deconvolution).\n",
429
+ "\n",
430
+ "Upsamples the input by a factor of 2 (increases spatial size).\n",
431
+ "\n",
432
+ "Why?\n",
433
+ "\n",
434
+ "Increases resolution to match the original input image.\n",
435
+ "\n",
436
+ "2️⃣ Merge Skip Connection\n",
437
+ "\n",
438
+ "Combines the upsampled output with the skip connection.\n",
439
+ "\n",
440
+ "Uses element-wise maximum instead of addition.\n",
441
+ "\n",
442
+ "Why?\n",
443
+ "\n",
444
+ "Ensures the model focuses on the most important features.\n",
445
+ "\n",
446
+ "Prevents loss of key information during encoding.\n",
447
+ "\n",
448
+ "3️⃣ Apply a Residual Block\n",
449
+ "\n",
450
+ "Uses a residual block to refine the upsampled output.\n",
451
+ "\n",
452
+ "Helps recover lost details and maintain stability.\n",
453
+ "\n",
454
+ "4️⃣ Return the Processed Output\n",
455
+ "\n",
456
+ "Returns the final feature map after upsampling and refinement.\n",
457
+ "\n",
458
+ "\n"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": 6,
464
+ "metadata": {},
465
+ "outputs": [],
466
+ "source": [
467
+ "def encoder_block(inputs, filters, use_norm=True):\n",
468
+ " x = residual_block(inputs, filters, use_norm)\n",
469
+ " skip = x\n",
470
+ " x = layers.MaxPooling2D()(x)\n",
471
+ " return x, skip\n",
472
+ "\n",
473
+ "def decoder_block(inputs, skip, filters, use_norm=True):\n",
474
+ " x = layers.Conv2DTranspose(filters, KERNEL, strides=2, padding='same')(inputs)\n",
475
+ " x = layers.maximum([x, skip])\n",
476
+ " x = residual_block(x, filters, use_norm)\n",
477
+ " return x\n"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "markdown",
482
+ "metadata": {},
483
+ "source": [
484
+ "* ===================== Generator =====================*"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "markdown",
489
+ "metadata": {},
490
+ "source": [
491
+ "This function builds the generator model for a Variational Autoencoder (VAE) with a CycleGAN architecture. The generator is responsible for converting a CT scan into an MRI image (or vice versa) by learning to map the two domains.\n",
492
+ "\n",
493
+ ".\n",
494
+ "\n",
495
+ "🛠️ What This Function Does?\n",
496
+ "\n",
497
+ "It encodes an input image into a latent space.\n",
498
+ "\n",
499
+ "It applies variational sampling to introduce a probabilistic distribution.\n",
500
+ "\n",
501
+ "It decodes the latent representation back into an image.\n",
502
+ "\n",
503
+ "Uses skip connections to retain features across layers.\n",
504
+ "\n",
505
+ "1️⃣ Input Layer\n",
506
+ "\n",
507
+ "Defines the input tensor with the given IMAGE_SHAPE (e.g., (256, 256, 3), for RGB images).\n",
508
+ "\n",
509
+ "2️⃣ Encoder: Downsampling the Image\n",
510
+ "\n",
511
+ " Each encoder block halves the spatial resolution but doubles the filters.\n",
512
+ "\n",
513
+ " Stores skip connections (s1, s2, ..., s7) for later use in the decoder.\n",
514
+ "\n",
515
+ " After e7, the image is highly compressed into a feature map.\n",
516
+ "\n",
517
+ "3️⃣ Latent Space (Variational Sampling)\n",
518
+ "\n",
519
+ " Flattens the feature map into a 1D vector.\n",
520
+ "\n",
521
+ " Uses two dense layers to compute:\n",
522
+ "\n",
523
+ " z_mean → The mean of the latent distribution.\n",
524
+ "\n",
525
+ " z_log_var → The logarithm of the variance.\n",
526
+ "\n",
527
+ " Uses reparameterization trick (Sampling layer) to ensure backpropagation works in VAE.\n",
528
+ "\n",
529
+ "4️⃣ Reshape for Decoder\n",
530
+ "\n",
531
+ " Expands z into a 2x2 feature map to match e7 dimensions.\n",
532
+ "\n",
533
+ " Prepares the latent vector for decoding.\n",
534
+ "\n",
535
+ "5️⃣ Decoder: Upsampling the Image\n",
536
+ "\n",
537
+ " Each decoder block upsamples the feature map back to the original size.\n",
538
+ "\n",
539
+ " Uses skip connections (s1, s2, ..., s7) to restore spatial information.\n",
540
+ "\n",
541
+ " Mirrors the encoder process but in reverse.\n",
542
+ "\n",
543
+ "6️⃣ Final Output Layer\n",
544
+ "\n",
545
+ " Uses a Conv2D layer to produce the final RGB image.\n",
546
+ " \n",
547
+ " Applies sigmoid activation to ensure pixel values remain between [0,1].\n",
548
+ "\n",
549
+ "\n",
550
+ "\n",
551
+ "\n",
552
+ "\n",
553
+ "\n"
554
+ ]
555
+ },
556
+ {
557
+ "cell_type": "code",
558
+ "execution_count": 7,
559
+ "metadata": {},
560
+ "outputs": [],
561
+ "source": [
562
+ "def build_generator(name):\n",
563
+ " inputs = layers.Input(IMAGE_SHAPE)\n",
564
+ " \n",
565
+ " # Encoder\n",
566
+ " e1, s1 = encoder_block(inputs, FILTERS)\n",
567
+ " e2, s2 = encoder_block(e1, FILTERS*2)\n",
568
+ " e3, s3 = encoder_block(e2, FILTERS*4)\n",
569
+ " e4, s4 = encoder_block(e3, FILTERS*8)\n",
570
+ " e5, s5 = encoder_block(e4, FILTERS*16)\n",
571
+ " e6, s6 = encoder_block(e5, FILTERS*32)\n",
572
+ " e7, s7 = encoder_block(e6, FILTERS*64)\n",
573
+ " \n",
574
+ " # Latent Space\n",
575
+ " x = layers.Flatten()(e7)\n",
576
+ " z_mean = layers.Dense(LATENT_DIM, name=f\"z_mean_{name.split('_')[-1]}\")(x)\n",
577
+ " z_log_var = layers.Dense(LATENT_DIM, name=f\"z_log_var_{name.split('_')[-1]}\")(x)\n",
578
+ " z = Sampling()([z_mean, z_log_var])\n",
579
+ " \n",
580
+ " # Reshape for decoder\n",
581
+ " x = layers.Dense(2 * 2 * FILTERS*64)(z)\n",
582
+ " x = layers.Reshape((2, 2, FILTERS*64))(x)\n",
583
+ " \n",
584
+ " # Decoder\n",
585
+ " d0 = decoder_block(x, s7, FILTERS*64)\n",
586
+ " d1 = decoder_block(d0, s6, FILTERS*32)\n",
587
+ " d2 = decoder_block(d1, s5, FILTERS*16)\n",
588
+ " d3 = decoder_block(d2, s4, FILTERS*8)\n",
589
+ " d4 = decoder_block(d3, s3, FILTERS*4)\n",
590
+ " d5 = decoder_block(d4, s2, FILTERS*2)\n",
591
+ " d6 = decoder_block(d5, s1, FILTERS)\n",
592
+ " \n",
593
+ " outputs = layers.Conv2D(3, KERNEL, activation='sigmoid', padding='same')(d6)\n",
594
+ " return Model(inputs, [outputs, z_mean, z_log_var], name=name)"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "markdown",
599
+ "metadata": {},
600
+ "source": [
601
+ "\n",
602
+ "*===================== Discriminator =====================*\n",
603
+ "\n",
604
+ "\n",
605
+ "This function constructs the discriminator in a Generative Adversarial Network (GAN). The discriminator’s role is to classify an image as real or fake by extracting hierarchical features and making multi-scale predictions.\n",
606
+ "\n",
607
+ "What Does This Function Do?\n",
608
+ "\n",
609
+ " Extracts features from the input image using convolutional layers.\n",
610
+ "\n",
611
+ " Downsamples the image through multiple layers to capture both local and global features.\n",
612
+ "\n",
613
+ " Generates multiple outputs from different feature scales for better discrimination.\n",
614
+ "\n",
615
+ "1️⃣ Input Layer \n",
616
+ "\n",
617
+ " Defines the input tensor with a shape of IMAGE_SHAPE (e.g., (256, 256, 3) for RGB images).\n",
618
+ "\n",
619
+ " This means the discriminator takes an image as input.\n",
620
+ "\n",
621
+ "2️⃣ Feature Extraction\n",
622
+ "\n",
623
+ " x = inputs initializes x as the input image.\n",
624
+ "\n",
625
+ " features = [] creates a list to store intermediate feature map\n",
626
+ "\n",
627
+ "3️⃣ Initial Convolution\n",
628
+ "\n",
629
+ " Applies a convolutional layer (Conv2D) with FILTERS (e.g., 64 filters) to extract basic edges and textures.\n",
630
+ "\n",
631
+ " Uses LeakyReLU activation (alpha=0.2) instead of ReLU to allow small gradients for negative values.\n",
632
+ "\n",
633
+ " Stores the feature map in features.\n",
634
+ "\n",
635
+ "4️⃣ Downsampling Blocks (Feature Hierarchy)\n",
636
+ "\n",
637
+ " Defines filter_sizes, increasing filter count at each stage to learn complex features.\n",
638
+ "\n",
639
+ " Uses a loop to pass x through multiple encoder_block layers:\n",
640
+ "\n",
641
+ " Each encoder_block downsamples the feature map (reducing spatial size).\n",
642
+ "\n",
643
+ " Each block doubles the number of filters to capture more detailed features.\n",
644
+ "\n",
645
+ " Stores all extracted feature maps in features.\n",
646
+ "\n",
647
+ "5️⃣ Multi-Scale Outputs (Final Classification Layers)\n",
648
+ "\n",
649
+ " The discriminator does not produce a single output; it uses multiple feature scales.\n",
650
+ "\n",
651
+ " Extracts the last 4 feature maps (features[-4:]) to classify at different resolutions.\n",
652
+ "\n",
653
+ " Each feature map is passed through a final Conv2D layer with 1 filter to predict real vs fake scores.\n",
654
+ "\n",
655
+ " Stores the outputs in outputs.\n",
656
+ "\n",
657
+ "6️⃣ Return the Discriminator Model\n",
658
+ "\n",
659
+ " Creates a Keras Model that takes an image as input and outputs multiple classification scores.\n",
660
+ "\n",
661
+ " This helps in making fine-grained real/fake decisions.\n"
662
+ ]
663
+ },
664
+ {
665
+ "cell_type": "code",
666
+ "execution_count": 9,
667
+ "metadata": {},
668
+ "outputs": [],
669
+ "source": [
670
+ "def build_discriminator(name):\n",
671
+ " inputs = layers.Input(IMAGE_SHAPE)\n",
672
+ " \n",
673
+ " # Feature extraction\n",
674
+ " x = inputs\n",
675
+ " features = []\n",
676
+ " \n",
677
+ " # Initial convolution\n",
678
+ " x = layers.Conv2D(FILTERS, KERNEL, padding='same')(x)\n",
679
+ " x = layers.LeakyReLU(alpha=0.2)(x)\n",
680
+ " features.append(x)\n",
681
+ " \n",
682
+ " # Downsampling blocks\n",
683
+ " filter_sizes = [FILTERS*2, FILTERS*4, FILTERS*8, FILTERS*16, FILTERS*32, FILTERS*64]\n",
684
+ " for filters in filter_sizes:\n",
685
+ " x, _ = encoder_block(x, filters, use_norm=False)\n",
686
+ " features.append(x)\n",
687
+ " \n",
688
+ " # Multi-scale outputs\n",
689
+ " outputs = []\n",
690
+ " for i, feature in enumerate(features[-4:]):\n",
691
+ " out = layers.Conv2D(1, KERNEL, padding='same')(feature)\n",
692
+ " outputs.append(out)\n",
693
+ " \n",
694
+ " return Model(inputs, outputs, name=name)\n"
695
+ ]
696
+ },
697
+ {
698
+ "cell_type": "markdown",
699
+ "metadata": {},
700
+ "source": [
701
+ "*===================== Data Loading =====================*\n",
702
+ "\n"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": 10,
708
+ "metadata": {},
709
+ "outputs": [],
710
+ "source": [
711
+ "def load_images(path):\n",
712
+ " images = []\n",
713
+ " for p in pathlib.Path(path).glob('*.*'):\n",
714
+ " try:\n",
715
+ " img = cv2.imread(str(p))\n",
716
+ " if img is not None:\n",
717
+ " img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
718
+ " img = cv2.resize(img, IMAGE_SHAPE[:2])\n",
719
+ " img = img.astype(np.float32) / 255.0\n",
720
+ " images.append(img)\n",
721
+ " except Exception as e:\n",
722
+ " print(f\"Error loading image {p}: {e}\")\n",
723
+ " return np.array(images)"
724
+ ]
725
+ },
726
+ {
727
+ "cell_type": "markdown",
728
+ "metadata": {},
729
+ "source": [
730
+ "This function is responsible for loading and balancing two different medical imaging datasets: CT scans and MRI scans. The goal is to ensure that both datasets contain the same number of images to avoid class imbalance in training.\n",
731
+ "\n",
732
+ "\n",
733
+ "📌 What Does This Function Do?\n",
734
+ "\n",
735
+ "Loads CT scans from the given directory.\n",
736
+ "Loads MRI scans from the given directory.\n",
737
+ "Finds the smaller dataset (CT or MRI) and trims the larger one to match its size.\n",
738
+ "Returns balanced datasets with the same number of images.\n",
739
+ "\n",
740
+ "1️⃣ Loading CT Scans:\n",
741
+ "\n",
742
+ " Prints \"Loading CT scans...\" to inform the user.\n",
743
+ "\n",
744
+ " Calls load_images(ct_path), a function (likely defined elsewhere) that reads images from the directory specified by ct_path.\n",
745
+ "\n",
746
+ " Stores the loaded images in ct_scans.\n",
747
+ "\n",
748
+ "2️⃣ Loading MRI Scans\n",
749
+ "\n",
750
+ " Prints \"Loading MRI scans...\" to indicate MRI loading.\n",
751
+ "\n",
752
+ " Calls load_images(mri_path), which loads images from mri_path.\n",
753
+ "\n",
754
+ " Stores the MRI images in mri_scans.\n",
755
+ "\n",
756
+ "3️⃣ Balancing the Datasets\n",
757
+ "\n",
758
+ " Computes the minimum length between the two datasets.\n",
759
+ "\n",
760
+ " Ensures that the dataset with more images is trimmed to match the smaller one.\n",
761
+ "\n",
762
+ " Computes the minimum length between the two datasets.\n",
763
+ "\n",
764
+ " Ensures that the dataset with more images is trimmed to match the smaller one.\n",
765
+ "\n",
766
+ "\n",
767
+ "\n",
768
+ "\n",
769
+ "\n",
770
+ "\n",
771
+ "\n",
772
+ "\n",
773
+ " \n",
774
+ "\n"
775
+ ]
776
+ },
777
+ {
778
+ "cell_type": "code",
779
+ "execution_count": 11,
780
+ "metadata": {},
781
+ "outputs": [],
782
+ "source": [
783
+ "\n",
784
+ "def load_and_balance_datasets(ct_path, mri_path):\n",
785
+ " print(\"Loading CT scans...\")\n",
786
+ " ct_scans = load_images(ct_path)\n",
787
+ " print(\"Loading MRI scans...\")\n",
788
+ " mri_scans = load_images(mri_path)\n",
789
+ " \n",
790
+ " min_length = min(len(ct_scans), len(mri_scans))\n",
791
+ " ct_scans = ct_scans[:min_length]\n",
792
+ " mri_scans = mri_scans[:min_length]\n",
793
+ " \n",
794
+ " print(f\"Balanced datasets to {min_length} images each\")\n",
795
+ " return ct_scans, mri_scans"
796
+ ]
797
+ },
798
+ {
799
+ "cell_type": "markdown",
800
+ "metadata": {},
801
+ "source": [
802
+ "*Training Setup - Detailed Explanation*\n",
803
+ "\n",
804
+ "This block of code sets up the models and optimizers required for training a CycleGAN for CT ↔ MRI image translation. Let’s break it down step by step.\n",
805
+ "\n",
806
+ "📌 What Does This Code Do?\n",
807
+ " Builds the generator models (CT → MRI and MRI → CT).\n",
808
+ "\n",
809
+ " Builds the discriminator models for CT and MRI.\n",
810
+ "\n",
811
+ " Creates optimizers for training the generators and discriminators.\n",
812
+ "\n",
813
+ " Initializes model variables (trainable parameters for both generators and discriminators).\n",
814
+ "\n",
815
+ " Builds optimizers using the trainable variables.\n",
816
+ "\n",
817
+ "1️⃣ Building the Generator Models\n",
818
+ "\n",
819
+ " build_generator(name): This function (explained earlier) builds a U-Net-based Variational Autoencoder (VAE) generator.\n",
820
+ "\n",
821
+ " g_ct_mri: The generator that converts CT scans → MRI images.\n",
822
+ "\n",
823
+ " g_mri_ct: The generator that converts MRI images → CT scans\n",
824
+ "\n",
825
+ "2️⃣ Building the Discriminator Models\n",
826
+ "\n",
827
+ " build_discriminator(name): This function (explained earlier) builds the discriminators to differentiate real and fake images.\n",
828
+ "\n",
829
+ " d_ct: The discriminator that distinguishes real CT scans from fake ones.\n",
830
+ "\n",
831
+ " d_mri: The discriminator that distinguishes real MRI scans from fake ones.\n",
832
+ "\n",
833
+ "\n",
834
+ "3️⃣ Creating Optimizers\n",
835
+ "\n",
836
+ " g_opt: Optimizer for training both generators.\n",
837
+ "\n",
838
+ " d_opt: Optimizer for training both discriminators.\n",
839
+ "\n",
840
+ " Uses RMSprop as the optimizer.\n",
841
+ "\n",
842
+ " The learning rate (LEARNING_RATE) controls the step size for updates.\n",
843
+ "\n",
844
+ " Weight decay (WEIGHT_DECAY) prevents overfitting by penalizing large weights.\n",
845
+ "\n",
846
+ "4️⃣ Initializing Model Variables\n",
847
+ "\n",
848
+ " g_vars: Stores all trainable variables (weights & biases) of both generators.\n",
849
+ " d_vars: Stores all trainable variables of both discriminators.\n",
850
+ "\n",
851
+ " 📝 Why store trainable variables separately?\n",
852
+ "\n",
853
+ " Since generators and discriminators have separate losses, they need to be updated separately.\n",
854
+ "\n",
855
+ "5️⃣ Building Optimizers with Model Variables\n",
856
+ "\n",
857
+ " g_opt.build(g_vars): Tells TensorFlow that g_opt will optimize generator variables.\n",
858
+ "\n",
859
+ " d_opt.build(d_vars): Tells TensorFlow that d_opt will optimize discriminator variables.\n",
860
+ "\n",
861
+ " 📝 Why explicitly build the optimizers?\n",
862
+ "\n",
863
+ " In Eager Execution mode, TensorFlow automatically tracks variables.\n",
864
+ " \n",
865
+ " However, explicitly calling build() can help with performance optimization.\n",
866
+ "\n",
867
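+ "\n",
+ "    Once those definitions are in place, a minimal sanity-check sketch (shapes follow from IMAGE_SHAPE and LATENT_DIM used in this notebook):\n",
+ "\n",
+ "```python\n",
+ "dummy = tf.zeros((1, 256, 256, 3))                 # one IMAGE_SHAPE-sized input\n",
+ "fake_mri, z_mean, z_log_var = g_ct_mri(dummy, training=False)\n",
+ "print(fake_mri.shape, z_mean.shape)                # expected: (1, 256, 256, 3) and (1, 256)\n",
+ "```\n",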
+ "\n"
868
+ ]
869
+ },
870
+ {
871
+ "cell_type": "code",
872
+ "execution_count": 12,
873
+ "metadata": {},
874
+ "outputs": [
875
+ {
876
+ "name": "stdout",
877
+ "output_type": "stream",
878
+ "text": [
879
+ "WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\keras\\src\\backend.py:1398: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.\n",
880
+ "\n",
881
+ "WARNING:tensorflow:From d:\\VS CODE\\Web Dev\\Projects\\Image2Image\\image\\lib\\site-packages\\keras\\src\\layers\\pooling\\max_pooling2d.py:161: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n",
882
+ "\n"
883
+ ]
884
+ },
885
+ {
886
+ "ename": "NameError",
887
+ "evalue": "name 'Sampling' is not defined",
888
+ "output_type": "error",
889
+ "traceback": [
890
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
891
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
892
+ "Cell \u001b[1;32mIn[12], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ===================== Training Setup =====================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Build models\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m g_ct_mri \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_generator\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mCT_to_MRI\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m g_mri_ct \u001b[38;5;241m=\u001b[39m build_generator(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMRI_to_CT\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 5\u001b[0m d_ct \u001b[38;5;241m=\u001b[39m build_discriminator(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mD_CT\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
893
+ "Cell \u001b[1;32mIn[7], line 17\u001b[0m, in \u001b[0;36mbuild_generator\u001b[1;34m(name)\u001b[0m\n\u001b[0;32m 15\u001b[0m z_mean \u001b[38;5;241m=\u001b[39m layers\u001b[38;5;241m.\u001b[39mDense(LATENT_DIM, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mz_mean_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)(x)\n\u001b[0;32m 16\u001b[0m z_log_var \u001b[38;5;241m=\u001b[39m layers\u001b[38;5;241m.\u001b[39mDense(LATENT_DIM, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mz_log_var_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)(x)\n\u001b[1;32m---> 17\u001b[0m z \u001b[38;5;241m=\u001b[39m \u001b[43mSampling\u001b[49m()([z_mean, z_log_var])\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# Reshape for decoder\u001b[39;00m\n\u001b[0;32m 20\u001b[0m x \u001b[38;5;241m=\u001b[39m layers\u001b[38;5;241m.\u001b[39mDense(\u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m FILTERS\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m64\u001b[39m)(z)\n",
894
+ "\u001b[1;31mNameError\u001b[0m: name 'Sampling' is not defined"
895
+ ]
896
+ }
897
+ ],
898
+ "source": [
899
+ "# ===================== Training Setup =====================\n",
900
+ "# Build models\n",
901
+ "g_ct_mri = build_generator('CT_to_MRI')\n",
902
+ "g_mri_ct = build_generator('MRI_to_CT')\n",
903
+ "d_ct = build_discriminator('D_CT')\n",
904
+ "d_mri = build_discriminator('D_MRI')\n",
905
+ "\n",
906
+ "# Create optimizers\n",
907
+ "g_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
908
+ "d_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
909
+ "\n",
910
+ "# Initialize model variables\n",
911
+ "g_vars = g_ct_mri.trainable_variables + g_mri_ct.trainable_variables\n",
912
+ "d_vars = d_ct.trainable_variables + d_mri.trainable_variables\n",
913
+ "\n",
914
+ "# Build optimizers\n",
915
+ "g_opt.build(g_vars)\n",
916
+ "d_opt.build(d_vars)"
917
+ ]
918
+ },
919
+ {
920
+ "cell_type": "markdown",
921
+ "metadata": {},
922
+ "source": [
923
+ "*Explanation of train_step Function in CycleGAN with Variational Autoencoder (VAE)*\n",
924
+ "\n",
925
+ "This function performs one training step for the CycleGAN with VAE-style latent representations. It does the following:\n",
926
+ "\n",
927
+ "Generates fake images using the generators.\n",
928
+ "\n",
929
+ "Evaluates the fake and real images using the discriminators.\n",
930
+ "\n",
931
+ "Computes the loss functions for both generators and discriminators.\n",
932
+ "\n",
933
+ "Computes gradients and updates the model parameters.\n",
934
+ "\n",
935
+ "1️⃣ Forward Pass - Generate Fake Images\n",
936
+ "\n",
937
+ " g_ct_mri(real_ct): Translates CT → Fake MRI and produces:\n",
938
+ " \n",
939
+ " fake_mri: The generated MRI image.\n",
940
+ " z_mean_fwd, z_log_var_fwd: Latent variables (from the Variational Autoencoder).\n",
941
+ " g_mri_ct(real_mri): Translates MRI → Fake CT with similar outputs.\n",
942
+ "\n",
943
+ " 📝 Why store z_mean and z_log_var?\n",
944
+ "\n",
945
+ " These come from the VAE latent space and are used for the KL divergence loss.\n",
946
+ "\n",
947
+ "2️⃣ Compute Discriminator Outputs\n",
948
+ "\n",
949
+ " d_ct(real_ct): Discriminator’s prediction for real CT images.\n",
950
+ "\n",
951
+ " d_ct(fake_ct): Discriminator’s prediction for fake CT images.\n",
952
+ "\n",
953
+ " d_mri(real_mri): Discriminator’s prediction for real MRI images.\n",
954
+ "\n",
955
+ " d_mri(fake_mri): Discriminator’s prediction for fake MRI images.\n",
956
+ "\n",
957
+ " 📝 Goal of Discriminators?\n",
958
+ "\n",
959
+ "\n",
960
+ " Real images should be classified close to 1.\n",
961
+ "\n",
962
+ " Fake images should be classified close to 0.\n",
963
+ "\n",
964
+ "3️⃣ Compute Discriminator Losses\n",
965
+ "\n",
966
+ " Uses Least Squares GAN (LSGAN) loss:\n",
967
+ "\n",
968
+ " For real images: (real - 1)^2 → Encourages real images to be classified as 1.\n",
969
+ "\n",
970
+ " For fake images: fake^2 → Encourages fake images to be classified as 0.\n",
971
+ "\n",
972
+ " sum([...]): If there are multiple output layers in the discriminator, we sum their losses.\n",
973
+ "\n",
974
+ " 📝 Why LSGAN loss?\n",
975
+ "\n",
976
+ " Helps stabilize training compared to standard GAN loss.\n",
977
+ "\n",
978
+ "\n",
979
+ "4️⃣ Cycle Consistency Loss (CycleGAN Component)\n",
980
+ "\n",
981
+ " cycled_ct = g_mri_ct(fake_mri): The fake MRI is translated back to CT.\n",
982
+ "\n",
983
+ " cycled_mri = g_ct_mri(fake_ct): The fake CT is translated back to MRI.\n",
984
+ "\n",
985
+ " 📝 Why cycle consistency?\n",
986
+ "\n",
987
+ " The network should learn round-trip consistency:\n",
988
+ "\n",
989
+ " CT → Fake MRI → CT (should look like original CT)\n",
990
+ "\n",
991
+ " MRI → Fake CT → MRI (should look like original MRI)\n",
992
+ "\n",
993
+ "5️⃣ KL Divergence Loss (VAE Component)\n",
994
+ "\n",
995
+ " This is the KL divergence loss from VAE:\n",
996
+ "\n",
997
+ " Encourages the latent space to follow a Gaussian distribution.\n",
998
+ "\n",
999
+ " Prevents mode collapse.\n",
1000
+ "\n",
1001
+ " 📝 Why add KL divergence loss?\n",
1002
+ "\n",
1003
+ " Regularizes the latent space so the generator produces diverse outputs.\n",
1004
+ "\n",
1005
+ "6️⃣ Compute Generator Losses\n",
1006
+ "\n",
1007
+ " The generator wants fake images to be classified as real (1), so we use:\n",
1008
+ "\n",
1009
+ " (fake - 1)^2 → Fake images should be close to 1.\n",
1010
+ "\n",
1011
+ " Cycle consistency loss: L1 loss (|original - reconstructed|).\n",
1012
+ "\n",
1013
+ " Encourages faithful reconstructions.\n",
1014
+ "\n",
1015
+ "\n",
1016
+ " Final generator loss combines:\n",
1017
+ "\n",
1018
+ " Adversarial loss (GAN loss).\n",
1019
+ "\n",
1020
+ " Cycle consistency loss (weighted by 10 for stronger enforcement).\n",
1021
+ "\n",
1022
+ " KL divergence loss (weighted by 0.5).\n",
1023
+ "\n",
1024
+ " \n",
1025
+ "7️⃣ Compute Total Discriminator Loss\n",
1026
+ "\n",
1027
+ "Adds both discriminator losses.\n",
1028
+ "\n",
1029
+ "8️⃣ Compute Gradients & Update Model Parameters\n",
1030
+ "\n",
1031
+ " Computes gradients of discriminator loss (d_total_loss).\n",
1032
+ "\n",
1033
+ " Updates discriminator weights (d_vars).\n",
1034
+ "\n",
1035
+ " Computes gradients of generator loss (g_total_loss).\n",
1036
+ "\n",
1037
+ " Updates generator weights (g_vars).\n",
1038
+ "\n",
1039
+ "📝 Why use tf.GradientTape(persistent=True)?\n",
1040
+ "\n",
1041
+ " We need gradients twice (once for discriminators, once for generators).\n",
1042
+ "\n",
1043
+ "\n"
1044
+ ]
1045
+ },
1046
+ {
1047
+ "cell_type": "code",
1048
+ "execution_count": null,
1049
+ "metadata": {},
1050
+ "outputs": [],
1051
+ "source": [
1052
+ "import os\n",
1053
+ "@tf.function\n",
1054
+ "def train_step(real_ct, real_mri):\n",
1055
+ " with tf.GradientTape(persistent=True) as tape:\n",
1056
+ " # Forward passes\n",
1057
+ " fake_mri, z_mean_fwd, z_log_var_fwd = g_ct_mri(real_ct, training=True)\n",
1058
+ " fake_ct, z_mean_bwd, z_log_var_bwd = g_mri_ct(real_mri, training=True)\n",
1059
+ " \n",
1060
+ " # Discriminator outputs\n",
1061
+ " d_real_ct = d_ct(real_ct, training=True)\n",
1062
+ " d_fake_ct = d_ct(fake_ct, training=True)\n",
1063
+ " d_real_mri = d_mri(real_mri, training=True)\n",
1064
+ " d_fake_mri = d_mri(fake_mri, training=True)\n",
1065
+ " \n",
1066
+ " # Discriminator losses\n",
1067
+ " d_ct_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
1068
+ " for real, fake in zip(d_real_ct, d_fake_ct)])\n",
1069
+ " d_mri_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
1070
+ " for real, fake in zip(d_real_mri, d_fake_mri)])\n",
1071
+ " \n",
1072
+ " # Cycle consistency\n",
1073
+ " cycled_ct, _, _ = g_mri_ct(fake_mri, training=True)\n",
1074
+ " cycled_mri, _, _ = g_ct_mri(fake_ct, training=True)\n",
1075
+ " \n",
1076
+ " # KL Divergence\n",
1077
+ " kl_fwd = -0.5 * tf.reduce_mean(1 + z_log_var_fwd - tf.square(z_mean_fwd) - tf.exp(z_log_var_fwd))\n",
1078
+ " kl_bwd = -0.5 * tf.reduce_mean(1 + z_log_var_bwd - tf.square(z_mean_bwd) - tf.exp(z_log_var_bwd))\n",
1079
+ " \n",
1080
+ " # Generator losses\n",
1081
+ " g_adv_loss = sum([tf.reduce_mean((fake - 1)**2) for fake in d_fake_mri + d_fake_ct])\n",
1082
+ " g_cycle_loss = (tf.reduce_mean(tf.abs(real_ct - cycled_ct)) + \n",
1083
+ " tf.reduce_mean(tf.abs(real_mri - cycled_mri)))\n",
1084
+ " g_total_loss = g_adv_loss + 10 * g_cycle_loss + 0.5 * (kl_fwd + kl_bwd)\n",
1085
+ " \n",
1086
+ " # Total discriminator loss\n",
1087
+ " d_total_loss = d_ct_loss + d_mri_loss\n",
1088
+ " \n",
1089
+ " # Update discriminators\n",
1090
+ " d_grads = tape.gradient(d_total_loss, d_vars)\n",
1091
+ " d_opt.apply_gradients(zip(d_grads, d_vars))\n",
1092
+ " \n",
1093
+ " # Update generators\n",
1094
+ " g_grads = tape.gradient(g_total_loss, g_vars)\n",
1095
+ " g_opt.apply_gradients(zip(g_grads, g_vars))\n",
1096
+ " \n",
1097
+ " return {\n",
1098
+ " 'd_ct': d_ct_loss,\n",
1099
+ " 'd_mri': d_mri_loss,\n",
1100
+ " 'g_total': g_total_loss,\n",
1101
+ " 'fake_mri': fake_mri,\n",
1102
+ " 'fake_ct': fake_ct\n",
1103
+ " }"
1104
+ ]
1105
+ },
1106
+ {
1107
+ "cell_type": "markdown",
1108
+ "metadata": {},
1109
+ "source": [
1110
+ "This code defines the main training loop for a CycleGAN-based model that translates between CT and MRI images. It consists of data preparation, training iteration, progress tracking, and model saving. Below is a step-by-step breakdown:\n",
1111
+ "\n",
1112
+ "\n",
1113
+ "1. Create Progress Directory\n",
1114
+ "\n",
1115
+ "The script creates a directory named progress/ inside Kaggle's working directory.\n",
1116
+ "\n",
1117
+ "This directory will store progress images showing how well the model is learning over time.\n",
1118
+ "\n",
1119
+ "2. Load and Balance the Datasets\n",
1120
+ "\n",
1121
+ "Calls load_and_balance_datasets() to load CT and MRI images from the dataset folders.\n",
1122
+ "\n",
1123
+ "Ensures both datasets have the same number of images by truncating the larger set.\n",
1124
+ "\n",
1125
+ "3. Create TensorFlow Dataset for Training\n",
1126
+ "\n",
1127
+ "Creates a TensorFlow dataset from the loaded images.\n",
1128
+ "\n",
1129
+ "Shuffles the dataset to introduce randomness and prevent overfitting.\n",
1130
+ "\n",
1131
+ "Batches the dataset to process multiple images in parallel during training.\n",
1132
+ "\n",
1133
+ "\n",
1134
+ "4. Training Loop\n",
1135
+ "\n",
1136
+ "Starts iterating over epochs (EPOCHS defines the total number of passes over the dataset).\n",
1137
+ "\n",
1138
+ "Iterates through mini-batches of CT and MRI scans using train_dataset.\n",
1139
+ "\n",
1140
+ "5. Train the Model (Forward & Backward Pass)\n",
1141
+ "\n",
1142
+ "Calls train_step(ct_batch, mri_batch), which:\n",
1143
+ "\n",
1144
+ " Generates fake MRI from CT (G_CT→MRI) and fake CT from MRI (G_MRI→CT).\n",
1145
+ "\n",
1146
+ " Passes both real and fake images through the discriminators (D_CT and D_MRI).\n",
1147
+ "\n",
1148
+ " Computes adversarial losses, cycle consistency loss, and KL divergence.\n",
1149
+ "\n",
1150
+ " Updates the discriminators (D_CT, D_MRI) and generators (G_CT→MRI, G_MRI→CT).\n",
1151
+ "\n",
1152
+ "Stores the loss values (d_ct_loss, d_mri_loss, g_total_loss) and the generated images.\n",
1153
+ "\n",
1154
+ "6. Print Losses for Monitoring\n",
1155
+ "\n",
1156
+ "Every 10 batches, prints:\n",
1157
+ "\n",
1158
+ "D_CT: Discriminator loss for CT.\n",
1159
+ "\n",
1160
+ "D_MRI: Discriminator loss for MRI.\n",
1161
+ "\n",
1162
+ "G: Total generator loss.\n",
1163
+ "\n",
1164
+ "This helps monitor model performance during training.\n",
1165
+ "\n",
1166
+ "7. Save Sample Images for Progress Tracking\n",
1167
+ "\n",
1168
+ "Every 100 batches, saves progress images to progress/.\n",
1169
+ "\n",
1170
+ "Displays real CT & MRI images alongside their fake counterparts generated by the model.\n",
1171
+ "\n",
1172
+ "Helps visually track improvements in image quality over time.\n",
1173
+ "\n"
1174
+ ]
1175
+ },
1176
+ {
1177
+ "cell_type": "code",
1178
+ "execution_count": 14,
1179
+ "metadata": {},
1180
+ "outputs": [
1181
+ {
1182
+ "ename": "NameError",
1183
+ "evalue": "name 'os' is not defined",
1184
+ "output_type": "error",
1185
+ "traceback": [
1186
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1187
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
1188
+ "Cell \u001b[1;32mIn[14], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ===================== Main Training Loop =====================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Create progress directory if it doesn't exist\u001b[39;00m\n\u001b[0;32m 3\u001b[0m progress_dir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/kaggle/working/progress\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(progress_dir):\n\u001b[0;32m 5\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(progress_dir)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m# Load and prepare data\u001b[39;00m\n",
1189
+ "\u001b[1;31mNameError\u001b[0m: name 'os' is not defined"
1190
+ ]
1191
+ }
1192
+ ],
1193
+ "source": [
1194
+ "\n",
1195
+ "# ===================== Main Training Loop =====================\n",
1196
+ "# Create progress directory if it doesn't exist\n",
1197
+ "progress_dir = '/kaggle/working/progress'\n",
1198
+ "if not os.path.exists(progress_dir):\n",
1199
+ " os.makedirs(progress_dir)\n",
1200
+ "\n",
1201
+ "# Load and prepare data\n",
1202
+ "print(\"Loading datasets...\")\n",
1203
+ "ct_scans, mri_scans = load_and_balance_datasets('/kaggle/input/ct-to-mri-cgan/Dataset/images/trainA', \n",
1204
+ " '/kaggle/input/ct-to-mri-cgan/Dataset/images/trainB')\n",
1205
+ "\n",
1206
+ "# Create TensorFlow dataset\n",
1207
+ "train_dataset = tf.data.Dataset.from_tensor_slices((ct_scans, mri_scans))\n",
1208
+ "train_dataset = train_dataset.shuffle(buffer_size=len(ct_scans)).batch(BATCH_SIZE)\n",
1209
+ "# Training loop\n",
1210
+ "print(\"Starting training...\")\n",
1211
+ "for epoch in range(EPOCHS):\n",
1212
+ " for batch_idx, (ct_batch, mri_batch) in enumerate(train_dataset):\n",
1213
+ " results = train_step(ct_batch, mri_batch)\n",
1214
+ " \n",
1215
+ " if batch_idx % 10 == 0:\n",
1216
+ " print(f\"Epoch {epoch}, Batch {batch_idx}: \"\n",
1217
+ " f\"D_CT={float(results['d_ct']):.4f}, \"\n",
1218
+ " f\"D_MRI={float(results['d_mri']):.4f}, \"\n",
1219
+ " f\"G={float(results['g_total']):.4f}\")\n",
1220
+ " \n",
1221
+ " # Save sample images every 100 batches\n",
1222
+ " if batch_idx % 100 == 0:\n",
1223
+ " fig, axes = plt.subplots(2, 2, figsize=(10, 10))\n",
1224
+ " \n",
1225
+ " # Real CT and Fake MRI\n",
1226
+ " axes[0,0].imshow(ct_batch[0].numpy())\n",
1227
+ " axes[0,0].set_title(\"Real CT\")\n",
1228
+ " axes[0,0].axis('off')\n",
1229
+ " \n",
1230
+ " axes[0,1].imshow(results['fake_mri'][0].numpy())\n",
1231
+ " axes[0,1].set_title(\"Fake MRI\")\n",
1232
+ " axes[0,1].axis('off')\n",
1233
+ " \n",
1234
+ " # Real MRI and Fake CT\n",
1235
+ " axes[1,0].imshow(mri_batch[0].numpy())\n",
1236
+ " axes[1,0].set_title(\"Real MRI\")\n",
1237
+ " axes[1,0].axis('off')\n",
1238
+ " \n",
1239
+ " axes[1,1].imshow(results['fake_ct'][0].numpy())\n",
1240
+ " axes[1,1].set_title(\"Fake CT\")\n",
1241
+ " axes[1,1].axis('off')\n",
1242
+ " \n",
1243
+ " plt.tight_layout()\n",
1244
+ " plt.savefig(f'progress/epoch_{epoch}_batch_{batch_idx}.png')\n",
1245
+ " plt.close()\n",
1246
+ " \n",
1247
+ " # Save models after each epoch\n",
1248
+ " save_models(g_ct_mri, g_mri_ct, epoch)"
1249
+ ]
1250
+ },
1251
+ {
1252
+ "cell_type": "code",
1253
+ "execution_count": null,
1254
+ "metadata": {},
1255
+ "outputs": [],
1256
+ "source": [
1257
+ "\n",
1258
+ "def translate_image(model_path, image_path, output_path, mode='ct_to_mri'):\n",
1259
+ " \"\"\"\n",
1260
+ " Translate a single image using the trained model\n",
1261
+ " \n",
1262
+ " Parameters:\n",
1263
+ " model_path: Path to the saved model\n",
1264
+ " image_path: Path to the input image\n",
1265
+ " output_path: Path to save the translated image\n",
1266
+ " mode: 'ct_to_mri' or 'mri_to_ct'\n",
1267
+ " \"\"\"\n",
1268
+ " # Load model\n",
1269
+ " print(f\"Loading model from {model_path}\")\n",
1270
+ " model = tf.keras.models.load_model(model_path, \n",
1271
+ " custom_objects={'Sampling': Sampling})\n",
1272
+ " \n",
1273
+ " # Load and preprocess image\n",
1274
+ " input_image = load_and_preprocess_image(image_path)\n",
1275
+ " \n",
1276
+ " # Generate translation\n",
1277
+ " print(\"Generating translation...\")\n",
1278
+ " translated_image, _, _ = model(input_image, training=False)\n",
1279
+ " \n",
1280
+ " # Convert to numpy and denormalize\n",
1281
+ " translated_image = translated_image.numpy()[0] * 255\n",
1282
+ " translated_image = translated_image.astype(np.uint8)\n",
1283
+ " \n",
1284
+ " # Save the result\n",
1285
+ " print(f\"Saving translated image to {output_path}\")\n",
1286
+ " plt.figure(figsize=(10, 5))\n",
1287
+ " \n",
1288
+ " plt.subplot(1, 2, 1)\n",
1289
+ " plt.title(\"Input Image\")\n",
1290
+ " plt.imshow(input_image[0])\n",
1291
+ " plt.axis('off')\n",
1292
+ " \n",
1293
+ " plt.subplot(1, 2, 2)\n",
1294
+ " plt.title(\"Translated Image\")\n",
1295
+ " plt.imshow(translated_image)\n",
1296
+ " plt.axis('off')\n",
1297
+ " \n",
1298
+ " plt.tight_layout()\n",
1299
+ " plt.savefig(output_path)\n",
1300
+ " plt.close()\n",
1301
+ " \n",
1302
+ " return translated_image\n",
1303
+ "'''\n",
1304
+ "# Example usage of the translation function\n",
1305
+ "def example_translation():\n",
1306
+ " \"\"\"Example of how to use the translation function\"\"\"\n",
1307
+ " # Paths\n",
1308
+ " ct_to_mri_model = 'saved_models/ct_to_mri_epoch_1000'\n",
1309
+ " mri_to_ct_model = 'saved_models/mri_to_ct_epoch_1000'\n",
1310
+ " \n",
1311
+ " # CT to MRI translation\n",
1312
+ " input_ct = 'path/to/your/ct_image.jpg'\n",
1313
+ " output_mri = 'results/translated_mri.png'\n",
1314
+ " translated_mri = translate_image(ct_to_mri_model, input_ct, output_mri, \n",
1315
+ " mode='ct_to_mri')\n",
1316
+ " \n",
1317
+ " # MRI to CT translation\n",
1318
+ " input_mri = 'path/to/your/mri_image.jpg'\n",
1319
+ " output_ct = 'results/translated_ct.png'\n",
1320
+ " translated_ct = translate_image(mri_to_ct_model, input_mri, output_ct, \n",
1321
+ " mode='mri_to_ct')'''"
1322
+ ]
1323
+ },
1324
+ {
1325
+ "cell_type": "markdown",
1326
+ "metadata": {},
1327
+ "source": [
1328
+ "*Complete code in Single Block*"
1329
+ ]
1330
+ },
1331
+ {
1332
+ "cell_type": "code",
1333
+ "execution_count": null,
1334
+ "metadata": {},
1335
+ "outputs": [],
1336
+ "source": [
1337
+ "import tensorflow as tf\n",
1338
+ "from tensorflow.keras import layers, Model\n",
1339
+ "import numpy as np\n",
1340
+ "import cv2\n",
1341
+ "import pathlib\n",
1342
+ "import matplotlib.pyplot as plt\n",
1343
+ "import tensorflow_probability as tfp\n",
1344
+ "\n",
1345
+ "tfd = tfp.distributions\n",
1346
+ "\n",
1347
+ "# ===================== Configuration =====================\n",
1348
+ "IMAGE_SHAPE = (256, 256, 3)\n",
1349
+ "LATENT_DIM = 256\n",
1350
+ "FILTERS = 16\n",
1351
+ "KERNEL = 3\n",
1352
+ "LEARNING_RATE = 0.0001\n",
1353
+ "WEIGHT_DECAY = 6e-8\n",
1354
+ "BATCH_SIZE = 1\n",
1355
+ "EPOCHS = 10\n",
1356
+ "\n",
1357
+ "# ===================== Architecture Components =====================\n",
1358
+ "class Sampling(layers.Layer):\n",
1359
+ " def call(self, inputs):\n",
1360
+ " z_mean, z_log_var = inputs\n",
1361
+ " batch = tf.shape(z_mean)[0]\n",
1362
+ " dim = tf.shape(z_mean)[1]\n",
1363
+ " epsilon = tf.random.normal(shape=(batch, dim))\n",
1364
+ " return z_mean + tf.exp(0.5 * z_log_var) * epsilon\n",
1365
+ "\n",
1366
+ "def residual_block(inputs, filters, use_norm=True):\n",
1367
+ " x = layers.Conv2D(filters, KERNEL, padding='same')(inputs)\n",
1368
+ " x = layers.LeakyReLU(alpha=0.2)(x)\n",
1369
+ " if use_norm:\n",
1370
+ " x = layers.GroupNormalization(groups=1)(x)\n",
1371
+ " x = layers.Conv2D(filters, KERNEL, padding='same')(x)\n",
1372
+ " x = layers.LeakyReLU(alpha=0.2)(x)\n",
1373
+ " if use_norm:\n",
1374
+ " x = layers.GroupNormalization(groups=1)(x)\n",
1375
+ " shortcut = layers.Conv2D(filters, 1, padding='same')(inputs)\n",
1376
+ " return layers.maximum([x, shortcut])\n",
1377
+ "\n",
1378
+ "def encoder_block(inputs, filters, use_norm=True):\n",
1379
+ " x = residual_block(inputs, filters, use_norm)\n",
1380
+ " skip = x\n",
1381
+ " x = layers.MaxPooling2D()(x)\n",
1382
+ " return x, skip\n",
1383
+ "\n",
1384
+ "def decoder_block(inputs, skip, filters, use_norm=True):\n",
1385
+ " x = layers.Conv2DTranspose(filters, KERNEL, strides=2, padding='same')(inputs)\n",
1386
+ " x = layers.maximum([x, skip])\n",
1387
+ " x = residual_block(x, filters, use_norm)\n",
1388
+ " return x\n",
1389
+ "\n",
1390
+ "# ===================== Generator =====================\n",
1391
+ "def build_generator(name):\n",
1392
+ " inputs = layers.Input(IMAGE_SHAPE)\n",
1393
+ " \n",
1394
+ " # Encoder\n",
1395
+ " e1, s1 = encoder_block(inputs, FILTERS)\n",
1396
+ " e2, s2 = encoder_block(e1, FILTERS*2)\n",
1397
+ " e3, s3 = encoder_block(e2, FILTERS*4)\n",
1398
+ " e4, s4 = encoder_block(e3, FILTERS*8)\n",
1399
+ " e5, s5 = encoder_block(e4, FILTERS*16)\n",
1400
+ " e6, s6 = encoder_block(e5, FILTERS*32)\n",
1401
+ " e7, s7 = encoder_block(e6, FILTERS*64)\n",
1402
+ " \n",
1403
+ " # Latent Space\n",
1404
+ " x = layers.Flatten()(e7)\n",
1405
+ " z_mean = layers.Dense(LATENT_DIM, name=f\"z_mean_{name.split('_')[-1]}\")(x)\n",
1406
+ " z_log_var = layers.Dense(LATENT_DIM, name=f\"z_log_var_{name.split('_')[-1]}\")(x)\n",
1407
+ " z = Sampling()([z_mean, z_log_var])\n",
1408
+ " \n",
1409
+ " # Reshape for decoder\n",
1410
+ " x = layers.Dense(2 * 2 * FILTERS*64)(z)\n",
1411
+ " x = layers.Reshape((2, 2, FILTERS*64))(x)\n",
1412
+ " \n",
1413
+ " # Decoder\n",
1414
+ " d0 = decoder_block(x, s7, FILTERS*64)\n",
1415
+ " d1 = decoder_block(d0, s6, FILTERS*32)\n",
1416
+ " d2 = decoder_block(d1, s5, FILTERS*16)\n",
1417
+ " d3 = decoder_block(d2, s4, FILTERS*8)\n",
1418
+ " d4 = decoder_block(d3, s3, FILTERS*4)\n",
1419
+ " d5 = decoder_block(d4, s2, FILTERS*2)\n",
1420
+ " d6 = decoder_block(d5, s1, FILTERS)\n",
1421
+ " \n",
1422
+ " outputs = layers.Conv2D(3, KERNEL, activation='sigmoid', padding='same')(d6)\n",
1423
+ " return Model(inputs, [outputs, z_mean, z_log_var], name=name)\n",
1424
+ "\n",
1425
+ "# ===================== Discriminator =====================\n",
1426
+ "def build_discriminator(name):\n",
1427
+ " inputs = layers.Input(IMAGE_SHAPE)\n",
1428
+ " \n",
1429
+ " # Feature extraction\n",
1430
+ " x = inputs\n",
1431
+ " features = []\n",
1432
+ " \n",
1433
+ " # Initial convolution\n",
1434
+ " x = layers.Conv2D(FILTERS, KERNEL, padding='same')(x)\n",
1435
+ " x = layers.LeakyReLU(alpha=0.2)(x)\n",
1436
+ " features.append(x)\n",
1437
+ " \n",
1438
+ " # Downsampling blocks\n",
1439
+ " filter_sizes = [FILTERS*2, FILTERS*4, FILTERS*8, FILTERS*16, FILTERS*32, FILTERS*64]\n",
1440
+ " for filters in filter_sizes:\n",
1441
+ " x, _ = encoder_block(x, filters, use_norm=False)\n",
1442
+ " features.append(x)\n",
1443
+ " \n",
1444
+ " # Multi-scale outputs\n",
1445
+ " outputs = []\n",
1446
+ " for i, feature in enumerate(features[-4:]):\n",
1447
+ " out = layers.Conv2D(1, KERNEL, padding='same')(feature)\n",
1448
+ " outputs.append(out)\n",
1449
+ " \n",
1450
+ " return Model(inputs, outputs, name=name)\n",
1451
+ "\n",
1452
+ "# ===================== Data Loading =====================\n",
1453
+ "def load_images(path):\n",
1454
+ " images = []\n",
1455
+ " for p in pathlib.Path(path).glob('*.*'):\n",
1456
+ " try:\n",
1457
+ " img = cv2.imread(str(p))\n",
1458
+ " if img is not None:\n",
1459
+ " img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
1460
+ " img = cv2.resize(img, IMAGE_SHAPE[:2])\n",
1461
+ " img = img.astype(np.float32) / 255.0\n",
1462
+ " images.append(img)\n",
1463
+ " except Exception as e:\n",
1464
+ " print(f\"Error loading image {p}: {e}\")\n",
1465
+ " return np.array(images)\n",
1466
+ "\n",
1467
+ "def load_and_balance_datasets(ct_path, mri_path):\n",
1468
+ " print(\"Loading CT scans...\")\n",
1469
+ " ct_scans = load_images(ct_path)\n",
1470
+ " print(\"Loading MRI scans...\")\n",
1471
+ " mri_scans = load_images(mri_path)\n",
1472
+ " \n",
1473
+ " min_length = min(len(ct_scans), len(mri_scans))\n",
1474
+ " ct_scans = ct_scans[:min_length]\n",
1475
+ " mri_scans = mri_scans[:min_length]\n",
1476
+ " \n",
1477
+ " print(f\"Balanced datasets to {min_length} images each\")\n",
1478
+ " return ct_scans, mri_scans\n",
1479
+ "\n",
1480
+ "# ===================== Training Setup =====================\n",
1481
+ "# Build models\n",
1482
+ "g_ct_mri = build_generator('CT_to_MRI')\n",
1483
+ "g_mri_ct = build_generator('MRI_to_CT')\n",
1484
+ "d_ct = build_discriminator('D_CT')\n",
1485
+ "d_mri = build_discriminator('D_MRI')\n",
1486
+ "\n",
1487
+ "# Create optimizers\n",
1488
+ "g_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
1489
+ "d_opt = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
1490
+ "\n",
1491
+ "# Initialize model variables\n",
1492
+ "g_vars = g_ct_mri.trainable_variables + g_mri_ct.trainable_variables\n",
1493
+ "d_vars = d_ct.trainable_variables + d_mri.trainable_variables\n",
1494
+ "\n",
1495
+ "# Build optimizers\n",
1496
+ "g_opt.build(g_vars)\n",
1497
+ "d_opt.build(d_vars)\n",
1498
+ "\n",
1499
+ "# ===================== Training Function =====================\n",
1500
+ "@tf.function\n",
1501
+ "def train_step(real_ct, real_mri):\n",
1502
+ " with tf.GradientTape(persistent=True) as tape:\n",
1503
+ " # Forward passes\n",
1504
+ " fake_mri, z_mean_fwd, z_log_var_fwd = g_ct_mri(real_ct, training=True)\n",
1505
+ " fake_ct, z_mean_bwd, z_log_var_bwd = g_mri_ct(real_mri, training=True)\n",
1506
+ " \n",
1507
+ " # Discriminator outputs\n",
1508
+ " d_real_ct = d_ct(real_ct, training=True)\n",
1509
+ " d_fake_ct = d_ct(fake_ct, training=True)\n",
1510
+ " d_real_mri = d_mri(real_mri, training=True)\n",
1511
+ " d_fake_mri = d_mri(fake_mri, training=True)\n",
1512
+ " \n",
1513
+ " # Discriminator losses\n",
1514
+ " d_ct_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
1515
+ " for real, fake in zip(d_real_ct, d_fake_ct)])\n",
1516
+ " d_mri_loss = sum([tf.reduce_mean((real - 1)**2) + tf.reduce_mean(fake**2) \n",
1517
+ " for real, fake in zip(d_real_mri, d_fake_mri)])\n",
1518
+ " \n",
1519
+ " # Cycle consistency\n",
1520
+ " cycled_ct, _, _ = g_mri_ct(fake_mri, training=True)\n",
1521
+ " cycled_mri, _, _ = g_ct_mri(fake_ct, training=True)\n",
1522
+ " \n",
1523
+ " # KL Divergence\n",
1524
+ " kl_fwd = -0.5 * tf.reduce_mean(1 + z_log_var_fwd - tf.square(z_mean_fwd) - tf.exp(z_log_var_fwd))\n",
1525
+ " kl_bwd = -0.5 * tf.reduce_mean(1 + z_log_var_bwd - tf.square(z_mean_bwd) - tf.exp(z_log_var_bwd))\n",
1526
+ " \n",
1527
+ " # Generator losses\n",
1528
+ " g_adv_loss = sum([tf.reduce_mean((fake - 1)**2) for fake in d_fake_mri + d_fake_ct])\n",
1529
+ " g_cycle_loss = (tf.reduce_mean(tf.abs(real_ct - cycled_ct)) + \n",
1530
+ " tf.reduce_mean(tf.abs(real_mri - cycled_mri)))\n",
1531
+ " g_total_loss = g_adv_loss + 10 * g_cycle_loss + 0.5 * (kl_fwd + kl_bwd)\n",
1532
+ " \n",
1533
+ " # Total discriminator loss\n",
1534
+ " d_total_loss = d_ct_loss + d_mri_loss\n",
1535
+ " \n",
1536
+ " # Update discriminators\n",
1537
+ " d_grads = tape.gradient(d_total_loss, d_vars)\n",
1538
+ " d_opt.apply_gradients(zip(d_grads, d_vars))\n",
1539
+ " \n",
1540
+ " # Update generators\n",
1541
+ " g_grads = tape.gradient(g_total_loss, g_vars)\n",
1542
+ " g_opt.apply_gradients(zip(g_grads, g_vars))\n",
1543
+ " \n",
1544
+ " return {\n",
1545
+ " 'd_ct': d_ct_loss,\n",
1546
+ " 'd_mri': d_mri_loss,\n",
1547
+ " 'g_total': g_total_loss,\n",
1548
+ " 'fake_mri': fake_mri,\n",
1549
+ " 'fake_ct': fake_ct\n",
1550
+ " }\n",
1551
+ "\n",
1552
+ "\n",
1553
+ "\n",
1554
+ "import os\n",
1555
+ "\n",
1556
+ "def save_models(g_ct_mri, g_mri_ct, epoch, model_dir='/kaggle/working/saved_models'):\n",
1557
+ " \"\"\"Save models in HDF5 format after each epoch\"\"\"\n",
1558
+ " if not os.path.exists(model_dir):\n",
1559
+ " os.makedirs(model_dir)\n",
1560
+ " \n",
1561
+ " # Save as .h5 files\n",
1562
+ " ct_path = os.path.join(model_dir, f'ct_to_mri_epoch_{epoch}.h5')\n",
1563
+ " mri_path = os.path.join(model_dir, f'mri_to_ct_epoch_{epoch}.h5')\n",
1564
+ " \n",
1565
+ " g_ct_mri.save(ct_path)\n",
1566
+ " g_mri_ct.save(mri_path)\n",
1567
+ " print(f\"Models saved: {ct_path} and {mri_path}\")\n",
1568
+ "\n",
1569
+ "\n",
1570
+ "# ===================== Main Training Loop =====================\n",
1571
+ "# Create progress directory if it doesn't exist\n",
1572
+ "progress_dir = '/kaggle/working/progress'\n",
1573
+ "if not os.path.exists(progress_dir):\n",
1574
+ " os.makedirs(progress_dir)\n",
1575
+ "\n",
1576
+ "# Load and prepare data\n",
1577
+ "print(\"Loading datasets...\")\n",
1578
+ "ct_scans, mri_scans = load_and_balance_datasets('/kaggle/input/ct-to-mri-cgan/Dataset/images/trainA', \n",
1579
+ " '/kaggle/input/ct-to-mri-cgan/Dataset/images/trainB')\n",
1580
+ "\n",
1581
+ "# Create TensorFlow dataset\n",
1582
+ "train_dataset = tf.data.Dataset.from_tensor_slices((ct_scans, mri_scans))\n",
1583
+ "train_dataset = train_dataset.shuffle(buffer_size=len(ct_scans)).batch(BATCH_SIZE)\n",
1584
+ "# Training loop\n",
1585
+ "print(\"Starting training...\")\n",
1586
+ "for epoch in range(EPOCHS):\n",
1587
+ " for batch_idx, (ct_batch, mri_batch) in enumerate(train_dataset):\n",
1588
+ " results = train_step(ct_batch, mri_batch)\n",
1589
+ " \n",
1590
+ " if batch_idx % 10 == 0:\n",
1591
+ " print(f\"Epoch {epoch}, Batch {batch_idx}: \"\n",
1592
+ " f\"D_CT={float(results['d_ct']):.4f}, \"\n",
1593
+ " f\"D_MRI={float(results['d_mri']):.4f}, \"\n",
1594
+ " f\"G={float(results['g_total']):.4f}\")\n",
1595
+ " \n",
1596
+ " # Save sample images every 100 batches\n",
1597
+ " if batch_idx % 100 == 0:\n",
1598
+ " fig, axes = plt.subplots(2, 2, figsize=(10, 10))\n",
1599
+ " \n",
1600
+ " # Real CT and Fake MRI\n",
1601
+ " axes[0,0].imshow(ct_batch[0].numpy())\n",
1602
+ " axes[0,0].set_title(\"Real CT\")\n",
1603
+ " axes[0,0].axis('off')\n",
1604
+ " \n",
1605
+ " axes[0,1].imshow(results['fake_mri'][0].numpy())\n",
1606
+ " axes[0,1].set_title(\"Fake MRI\")\n",
1607
+ " axes[0,1].axis('off')\n",
1608
+ " \n",
1609
+ " # Real MRI and Fake CT\n",
1610
+ " axes[1,0].imshow(mri_batch[0].numpy())\n",
1611
+ " axes[1,0].set_title(\"Real MRI\")\n",
1612
+ " axes[1,0].axis('off')\n",
1613
+ " \n",
1614
+ " axes[1,1].imshow(results['fake_ct'][0].numpy())\n",
1615
+ " axes[1,1].set_title(\"Fake CT\")\n",
1616
+ " axes[1,1].axis('off')\n",
1617
+ " \n",
1618
+ " plt.tight_layout()\n",
1619
+ " plt.savefig(f'progress/epoch_{epoch}_batch_{batch_idx}.png')\n",
1620
+ " plt.close()\n",
1621
+ " \n",
1622
+ " # Save models after each epoch\n",
1623
+ " save_models(g_ct_mri, g_mri_ct, epoch)\n",
1624
+ "\n",
1625
+ "def load_and_preprocess_image(image_path):\n",
1626
+ " \"\"\"Load and preprocess a single image for inference\"\"\"\n",
1627
+ " # Read image\n",
1628
+ " img = cv2.imread(image_path)\n",
1629
+ " if img is None:\n",
1630
+ " raise ValueError(f\"Could not load image from {image_path}\")\n",
1631
+ " \n",
1632
+ " # Convert BGR to RGB\n",
1633
+ " img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
1634
+ " \n",
1635
+ " # Resize to model's input size\n",
1636
+ " img = cv2.resize(img, (256, 256))\n",
1637
+ " \n",
1638
+ " # Normalize to [0, 1]\n",
1639
+ " img = img.astype(np.float32) / 255.0\n",
1640
+ " \n",
1641
+ " # Add batch dimension\n",
1642
+ " img = np.expand_dims(img, axis=0)\n",
1643
+ " \n",
1644
+ " return img\n",
1645
+ "\n",
1646
+ "def translate_image(model_path, image_path, output_path, mode='ct_to_mri'):\n",
1647
+ " \"\"\"\n",
1648
+ " Translate a single image using the trained model\n",
1649
+ " \n",
1650
+ " Parameters:\n",
1651
+ " model_path: Path to the saved model\n",
1652
+ " image_path: Path to the input image\n",
1653
+ " output_path: Path to save the translated image\n",
1654
+ " mode: 'ct_to_mri' or 'mri_to_ct'\n",
1655
+ " \"\"\"\n",
1656
+ " # Load model\n",
1657
+ " print(f\"Loading model from {model_path}\")\n",
1658
+ " model = tf.keras.models.load_model(model_path, \n",
1659
+ " custom_objects={'Sampling': Sampling})\n",
1660
+ " \n",
1661
+ " # Load and preprocess image\n",
1662
+ " input_image = load_and_preprocess_image(image_path)\n",
1663
+ " \n",
1664
+ " # Generate translation\n",
1665
+ " print(\"Generating translation...\")\n",
1666
+ " translated_image, _, _ = model(input_image, training=False)\n",
1667
+ " \n",
1668
+ " # Convert to numpy and denormalize\n",
1669
+ " translated_image = translated_image.numpy()[0] * 255\n",
1670
+ " translated_image = translated_image.astype(np.uint8)\n",
1671
+ " \n",
1672
+ " # Save the result\n",
1673
+ " print(f\"Saving translated image to {output_path}\")\n",
1674
+ " plt.figure(figsize=(10, 5))\n",
1675
+ " \n",
1676
+ " plt.subplot(1, 2, 1)\n",
1677
+ " plt.title(\"Input Image\")\n",
1678
+ " plt.imshow(input_image[0])\n",
1679
+ " plt.axis('off')\n",
1680
+ " \n",
1681
+ " plt.subplot(1, 2, 2)\n",
1682
+ " plt.title(\"Translated Image\")\n",
1683
+ " plt.imshow(translated_image)\n",
1684
+ " plt.axis('off')\n",
1685
+ " \n",
1686
+ " plt.tight_layout()\n",
1687
+ " plt.savefig(output_path)\n",
1688
+ " plt.close()\n",
1689
+ " \n",
1690
+ " return translated_image\n",
1691
+ "'''\n",
1692
+ "# Example usage of the translation function\n",
1693
+ "def example_translation():\n",
1694
+ " \"\"\"Example of how to use the translation function\"\"\"\n",
1695
+ " # Paths\n",
1696
+ " ct_to_mri_model = 'saved_models/ct_to_mri_epoch_1000'\n",
1697
+ " mri_to_ct_model = 'saved_models/mri_to_ct_epoch_1000'\n",
1698
+ " \n",
1699
+ " # CT to MRI translation\n",
1700
+ " input_ct = 'path/to/your/ct_image.jpg'\n",
1701
+ " output_mri = 'results/translated_mri.png'\n",
1702
+ " translated_mri = translate_image(ct_to_mri_model, input_ct, output_mri, \n",
1703
+ " mode='ct_to_mri')\n",
1704
+ " \n",
1705
+ " # MRI to CT translation\n",
1706
+ " input_mri = 'path/to/your/mri_image.jpg'\n",
1707
+ " output_ct = 'results/translated_ct.png'\n",
1708
+ " translated_ct = translate_image(mri_to_ct_model, input_mri, output_ct, \n",
1709
+ " mode='mri_to_ct')'''"
1710
+ ]
1711
+ }
1736
+ ],
1737
+ "metadata": {
1738
+ "kernelspec": {
1739
+ "display_name": "image",
1740
+ "language": "python",
1741
+ "name": "python3"
1742
+ },
1743
+ "language_info": {
1744
+ "codemirror_mode": {
1745
+ "name": "ipython",
1746
+ "version": 3
1747
+ },
1748
+ "file_extension": ".py",
1749
+ "mimetype": "text/x-python",
1750
+ "name": "python",
1751
+ "nbconvert_exporter": "python",
1752
+ "pygments_lexer": "ipython3",
1753
+ "version": "3.10.11"
1754
+ }
1755
+ },
1756
+ "nbformat": 4,
1757
+ "nbformat_minor": 2
1758
+ }
demo.html ADDED
@@ -0,0 +1,66 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Simple CSS Page</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 0;
11
+ padding: 0;
12
+ text-align: center;
13
+ background-color: #f4f4f4;
14
+ }
15
+
16
+ header {
17
+ background-color: #3498db;
18
+ color: white;
19
+ padding: 20px;
20
+ font-size: 24px;
21
+
22
+ }
23
+
24
+ section {
25
+ margin: 20px auto;
26
+ padding: 20px;
27
+ background-color: white;
28
+ border-radius: 10px;
29
+
30
+ max-width: 600px;
31
+ }
32
+
33
+ button {
34
+ background-color: #2ecc71;
35
+ color: white;
36
+ padding: 10px 20px;
37
+ border: none;
38
+ border-radius: 5px;
39
+ cursor: pointer;
40
+ font-size: 16px;
41
+ transition: background-color 0.3s ease;
42
+ }
43
+
44
+ button:hover {
45
+ background-color: #27ae60;
46
+ }
47
+
48
+ footer {
49
+ background-color: #333;
50
+ color: white;
51
+ padding: 10px;
52
+ margin-top: 20px;
53
+ font-size: 14px;
54
+ }
55
+ </style>
56
+ </head>
57
+ <body>
58
+ <header>Welcome to My Simple CSS Page</header>
59
+ <section>
60
+ <h2>Styled Elements</h2>
61
+ <p>This is a simple webpage demonstrating basic CSS styling.</p>
62
+ <button>Click Me</button>
63
+ </section>
64
+ <footer>&copy; 2025 My Simple Page. All rights reserved.</footer>
65
+ </body>
66
+ </html>
env/Lib/site-packages/PyYAML-6.0.2.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
1
+ pip
env/Lib/site-packages/PyYAML-6.0.2.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2017-2021 Ingy döt Net
2
+ Copyright (c) 2006-2016 Kirill Simonov
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ this software and associated documentation files (the "Software"), to deal in
6
+ the Software without restriction, including without limitation the rights to
7
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
8
+ of the Software, and to permit persons to whom the Software is furnished to do
9
+ so, subject to the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all
12
+ copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ SOFTWARE.
env/Lib/site-packages/PyYAML-6.0.2.dist-info/METADATA ADDED
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.1
2
+ Name: PyYAML
3
+ Version: 6.0.2
4
+ Summary: YAML parser and emitter for Python
5
+ Home-page: https://pyyaml.org/
6
+ Download-URL: https://pypi.org/project/PyYAML/
7
+ Author: Kirill Simonov
8
+ Author-email: [email protected]
9
+ License: MIT
10
+ Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
11
+ Project-URL: CI, https://github.com/yaml/pyyaml/actions
12
+ Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
13
+ Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
14
+ Project-URL: Source Code, https://github.com/yaml/pyyaml
15
+ Platform: Any
16
+ Classifier: Development Status :: 5 - Production/Stable
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Cython
21
+ Classifier: Programming Language :: Python
22
+ Classifier: Programming Language :: Python :: 3
23
+ Classifier: Programming Language :: Python :: 3.8
24
+ Classifier: Programming Language :: Python :: 3.9
25
+ Classifier: Programming Language :: Python :: 3.10
26
+ Classifier: Programming Language :: Python :: 3.11
27
+ Classifier: Programming Language :: Python :: 3.12
28
+ Classifier: Programming Language :: Python :: 3.13
29
+ Classifier: Programming Language :: Python :: Implementation :: CPython
30
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
31
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
32
+ Classifier: Topic :: Text Processing :: Markup
33
+ Requires-Python: >=3.8
34
+ License-File: LICENSE
35
+
36
+ YAML is a data serialization format designed for human readability
37
+ and interaction with scripting languages. PyYAML is a YAML parser
38
+ and emitter for Python.
39
+
40
+ PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
41
+ support, capable extension API, and sensible error messages. PyYAML
42
+ supports standard YAML tags and provides Python-specific tags that
43
+ allow to represent an arbitrary Python object.
44
+
45
+ PyYAML is applicable for a broad range of tasks from complex
46
+ configuration files to object serialization and persistence.
env/Lib/site-packages/PyYAML-6.0.2.dist-info/RECORD ADDED
@@ -0,0 +1,43 @@
1
+ PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
3
+ PyYAML-6.0.2.dist-info/METADATA,sha256=9lwXqTOrXPts-jI2Lo5UwuaAYo0hiRA0BZqjch0WjAk,2106
4
+ PyYAML-6.0.2.dist-info/RECORD,,
5
+ PyYAML-6.0.2.dist-info/WHEEL,sha256=c7SWG1_hRvc9HXHEkmWlTu1Jr4WpzRucfzqTP-_8q0s,102
6
+ PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
7
+ _yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
8
+ _yaml/__pycache__/__init__.cpython-312.pyc,,
9
+ yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
10
+ yaml/__pycache__/__init__.cpython-312.pyc,,
11
+ yaml/__pycache__/composer.cpython-312.pyc,,
12
+ yaml/__pycache__/constructor.cpython-312.pyc,,
13
+ yaml/__pycache__/cyaml.cpython-312.pyc,,
14
+ yaml/__pycache__/dumper.cpython-312.pyc,,
15
+ yaml/__pycache__/emitter.cpython-312.pyc,,
16
+ yaml/__pycache__/error.cpython-312.pyc,,
17
+ yaml/__pycache__/events.cpython-312.pyc,,
18
+ yaml/__pycache__/loader.cpython-312.pyc,,
19
+ yaml/__pycache__/nodes.cpython-312.pyc,,
20
+ yaml/__pycache__/parser.cpython-312.pyc,,
21
+ yaml/__pycache__/reader.cpython-312.pyc,,
22
+ yaml/__pycache__/representer.cpython-312.pyc,,
23
+ yaml/__pycache__/resolver.cpython-312.pyc,,
24
+ yaml/__pycache__/scanner.cpython-312.pyc,,
25
+ yaml/__pycache__/serializer.cpython-312.pyc,,
26
+ yaml/__pycache__/tokens.cpython-312.pyc,,
27
+ yaml/_yaml.cp312-win_amd64.pyd,sha256=Bx7e_LEQx7cnd1_A9_nClp3X77g-_Lw1aoAAtYZbwWk,263680
28
+ yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
29
+ yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
30
+ yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
31
+ yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
32
+ yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
33
+ yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
34
+ yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
35
+ yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
36
+ yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
37
+ yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
38
+ yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
39
+ yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
40
+ yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
41
+ yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
42
+ yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
43
+ yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573
env/Lib/site-packages/PyYAML-6.0.2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.44.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-win_amd64
5
+
env/Lib/site-packages/PyYAML-6.0.2.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
1
+ _yaml
2
+ yaml
env/Lib/site-packages/_yaml/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ # This is a stub package designed to roughly emulate the _yaml
2
+ # extension module, which previously existed as a standalone module
3
+ # and has been moved into the `yaml` package namespace.
4
+ # It does not perfectly mimic its old counterpart, but should get
5
+ # close enough for anyone who's relying on it even when they shouldn't.
6
+ import yaml
7
+
8
+ # in some circumstances, the yaml module we imoprted may be from a different version, so we need
9
+ # to tread carefully when poking at it here (it may not have the attributes we expect)
10
+ if not getattr(yaml, '__with_libyaml__', False):
11
+ from sys import version_info
12
+
13
+ exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
14
+ raise exc("No module named '_yaml'")
15
+ else:
16
+ from yaml._yaml import *
17
+ import warnings
18
+ warnings.warn(
19
+ 'The _yaml extension module is now located at yaml._yaml'
20
+ ' and its location is subject to change. To use the'
21
+ ' LibYAML-based parser and emitter, import from `yaml`:'
22
+ ' `from yaml import CLoader as Loader, CDumper as Dumper`.',
23
+ DeprecationWarning
24
+ )
25
+ del warnings
26
+ # Don't `del yaml` here because yaml is actually an existing
27
+ # namespace member of _yaml.
28
+
29
+ __name__ = '_yaml'
30
+ # If the module is top-level (i.e. not a part of any specific package)
31
+ # then the attribute should be set to ''.
32
+ # https://docs.python.org/3.8/library/types.html
33
+ __package__ = ''
env/Lib/site-packages/certifi-2025.1.31.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
1
+ pip
env/Lib/site-packages/certifi-2025.1.31.dist-info/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ This package contains a modified version of ca-bundle.crt:
2
+
3
+ ca-bundle.crt -- Bundle of CA Root Certificates
4
+
5
+ This is a bundle of X.509 certificates of public Certificate Authorities
6
+ (CA). These were automatically extracted from Mozilla's root certificates
7
+ file (certdata.txt). This file can be found in the mozilla source tree:
8
+ https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
9
+ It contains the certificates in PEM format and therefore
10
+ can be directly used with curl / libcurl / php_curl, or with
11
+ an Apache+mod_ssl webserver for SSL client authentication.
12
+ Just configure this file as the SSLCACertificateFile.#
13
+
14
+ ***** BEGIN LICENSE BLOCK *****
15
+ This Source Code Form is subject to the terms of the Mozilla Public License,
16
+ v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
17
+ one at http://mozilla.org/MPL/2.0/.
18
+
19
+ ***** END LICENSE BLOCK *****
20
+ @(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
env/Lib/site-packages/certifi-2025.1.31.dist-info/METADATA ADDED
@@ -0,0 +1,77 @@
1
+ Metadata-Version: 2.2
2
+ Name: certifi
3
+ Version: 2025.1.31
4
+ Summary: Python package for providing Mozilla's CA Bundle.
5
+ Home-page: https://github.com/certifi/python-certifi
6
+ Author: Kenneth Reitz
7
+ Author-email: [email protected]
8
+ License: MPL-2.0
9
+ Project-URL: Source, https://github.com/certifi/python-certifi
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
13
+ Classifier: Natural Language :: English
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.6
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Requires-Python: >=3.6
26
+ License-File: LICENSE
27
+ Dynamic: author
28
+ Dynamic: author-email
29
+ Dynamic: classifier
30
+ Dynamic: description
31
+ Dynamic: home-page
32
+ Dynamic: license
33
+ Dynamic: project-url
34
+ Dynamic: requires-python
35
+ Dynamic: summary
36
+
37
+ Certifi: Python SSL Certificates
38
+ ================================
39
+
40
+ Certifi provides Mozilla's carefully curated collection of Root Certificates for
41
+ validating the trustworthiness of SSL certificates while verifying the identity
42
+ of TLS hosts. It has been extracted from the `Requests`_ project.
43
+
44
+ Installation
45
+ ------------
46
+
47
+ ``certifi`` is available on PyPI. Simply install it with ``pip``::
48
+
49
+ $ pip install certifi
50
+
51
+ Usage
52
+ -----
53
+
54
+ To reference the installed certificate authority (CA) bundle, you can use the
55
+ built-in function::
56
+
57
+ >>> import certifi
58
+
59
+ >>> certifi.where()
60
+ '/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
61
+
62
+ Or from the command line::
63
+
64
+ $ python -m certifi
65
+ /usr/local/lib/python3.7/site-packages/certifi/cacert.pem
66
+
67
+ Enjoy!
68
+
69
+ .. _`Requests`: https://requests.readthedocs.io/en/master/
70
+
71
+ Addition/Removal of Certificates
72
+ --------------------------------
73
+
74
+ Certifi does not support any addition/removal or other modification of the
75
+ CA trust store content. This project is intended to provide a reliable and
76
+ highly portable root of trust to python deployments. Look to upstream projects
77
+ for methods to use alternate trust.
env/Lib/site-packages/certifi-2025.1.31.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
1
+ certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
3
+ certifi-2025.1.31.dist-info/METADATA,sha256=t5kcT5aGu0dQ6_psUNZYTqnC0uCRnponewm3uYjeHbg,2451
4
+ certifi-2025.1.31.dist-info/RECORD,,
5
+ certifi-2025.1.31.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
6
+ certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
7
+ certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
8
+ certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
9
+ certifi/__pycache__/__init__.cpython-312.pyc,,
10
+ certifi/__pycache__/__main__.cpython-312.pyc,,
11
+ certifi/__pycache__/core.cpython-312.pyc,,
12
+ certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
13
+ certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
14
+ certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
env/Lib/site-packages/certifi-2025.1.31.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
env/Lib/site-packages/certifi-2025.1.31.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ certifi
env/Lib/site-packages/certifi/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .core import contents, where
2
+
3
+ __all__ = ["contents", "where"]
4
+ __version__ = "2025.01.31"
env/Lib/site-packages/certifi/__main__.py ADDED
@@ -0,0 +1,12 @@
+ import argparse
+
+ from certifi import contents, where
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-c", "--contents", action="store_true")
+ args = parser.parse_args()
+
+ if args.contents:
+     print(contents())
+ else:
+     print(where())
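Assuming the two helpers imported above behave as in released certifi versions, the same information is reachable without the CLI; a short illustrative sketch, not part of the packaged module:

```python
# Illustrative only: programmatic equivalents of `python -m certifi`
# and `python -m certifi --contents`.
import certifi

print(certifi.where())           # filesystem path of the bundled cacert.pem
print(certifi.contents()[:64])   # beginning of the PEM text itself
```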
env/Lib/site-packages/certifi/cacert.pem ADDED
The diff for this file is too large to render. See raw diff
 
env/Lib/site-packages/certifi/core.py ADDED
@@ -0,0 +1,114 @@
1
+ """
2
+ certifi.py
3
+ ~~~~~~~~~~
4
+
5
+ This module returns the installation location of cacert.pem or its contents.
6
+ """
7
+ import sys
8
+ import atexit
9
+
10
+ def exit_cacert_ctx() -> None:
11
+ _CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
12
+
13
+
14
+ if sys.version_info >= (3, 11):
15
+
16
+ from importlib.resources import as_file, files
17
+
18
+ _CACERT_CTX = None
19
+ _CACERT_PATH = None
20
+
21
+ def where() -> str:
22
+ # This is slightly terrible, but we want to delay extracting the file
23
+ # in cases where we're inside of a zipimport situation until someone
24
+ # actually calls where(), but we don't want to re-extract the file
25
+ # on every call of where(), so we'll do it once then store it in a
26
+ # global variable.
27
+ global _CACERT_CTX
28
+ global _CACERT_PATH
29
+ if _CACERT_PATH is None:
30
+ # This is slightly janky, the importlib.resources API wants you to
31
+ # manage the cleanup of this file, so it doesn't actually return a
32
+ # path, it returns a context manager that will give you the path
33
+ # when you enter it and will do any cleanup when you leave it. In
34
+ # the common case of not needing a temporary file, it will just
35
+ # return the file system location and the __exit__() is a no-op.
36
+ #
37
+ # We also have to hold onto the actual context manager, because
38
+ # it will do the cleanup whenever it gets garbage collected, so
39
+ # we will also store that at the global level as well.
40
+ _CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
41
+ _CACERT_PATH = str(_CACERT_CTX.__enter__())
42
+ atexit.register(exit_cacert_ctx)
43
+
44
+ return _CACERT_PATH
45
+
46
+ def contents() -> str:
47
+ return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
48
+
49
+ elif sys.version_info >= (3, 7):
50
+
51
+ from importlib.resources import path as get_path, read_text
52
+
53
+ _CACERT_CTX = None
54
+ _CACERT_PATH = None
55
+
56
+ def where() -> str:
57
+ # This is slightly terrible, but we want to delay extracting the
58
+ # file in cases where we're inside of a zipimport situation until
59
+ # someone actually calls where(), but we don't want to re-extract
60
+ # the file on every call of where(), so we'll do it once then store
61
+ # it in a global variable.
62
+ global _CACERT_CTX
63
+ global _CACERT_PATH
64
+ if _CACERT_PATH is None:
65
+ # This is slightly janky, the importlib.resources API wants you
66
+ # to manage the cleanup of this file, so it doesn't actually
67
+ # return a path, it returns a context manager that will give
68
+ # you the path when you enter it and will do any cleanup when
69
+ # you leave it. In the common case of not needing a temporary
70
+ # file, it will just return the file system location and the
71
+ # __exit__() is a no-op.
72
+ #
73
+ # We also have to hold onto the actual context manager, because
74
+ # it will do the cleanup whenever it gets garbage collected, so
75
+ # we will also store that at the global level as well.
76
+ _CACERT_CTX = get_path("certifi", "cacert.pem")
77
+ _CACERT_PATH = str(_CACERT_CTX.__enter__())
78
+ atexit.register(exit_cacert_ctx)
79
+
80
+ return _CACERT_PATH
81
+
82
+ def contents() -> str:
83
+ return read_text("certifi", "cacert.pem", encoding="ascii")
84
+
85
+ else:
86
+ import os
87
+ import types
88
+ from typing import Union
89
+
90
+ Package = Union[types.ModuleType, str]
91
+ Resource = Union[str, "os.PathLike"]
92
+
93
+ # This fallback will work for Python versions prior to 3.7 that lack the
94
+ # importlib.resources module but relies on the existing `where` function
95
+ # so won't address issues with environments like PyOxidizer that don't set
96
+ # __file__ on modules.
97
+ def read_text(
98
+ package: Package,
99
+ resource: Resource,
100
+ encoding: str = 'utf-8',
101
+ errors: str = 'strict'
102
+ ) -> str:
103
+ with open(where(), encoding=encoding) as data:
104
+ return data.read()
105
+
106
+ # If we don't have importlib.resources, then we will just do the old logic
107
+ # of assuming we're on the filesystem and munge the path directly.
108
+ def where() -> str:
109
+ f = os.path.dirname(__file__)
110
+
111
+ return os.path.join(f, "cacert.pem")
112
+
113
+ def contents() -> str:
114
+ return read_text("certifi", "cacert.pem", encoding="ascii")
env/Lib/site-packages/certifi/py.typed ADDED
File without changes
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
+ pip
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 TAHRI Ahmed R.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/METADATA ADDED
@@ -0,0 +1,721 @@
1
+ Metadata-Version: 2.1
2
+ Name: charset-normalizer
3
+ Version: 3.4.1
4
+ Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
5
+ Author-email: "Ahmed R. TAHRI" <[email protected]>
6
+ Maintainer-email: "Ahmed R. TAHRI" <[email protected]>
7
+ License: MIT
8
+ Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
9
+ Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
10
+ Project-URL: Code, https://github.com/jawah/charset_normalizer
11
+ Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
12
+ Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
26
+ Classifier: Programming Language :: Python :: 3 :: Only
27
+ Classifier: Programming Language :: Python :: Implementation :: CPython
28
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
29
+ Classifier: Topic :: Text Processing :: Linguistic
30
+ Classifier: Topic :: Utilities
31
+ Classifier: Typing :: Typed
32
+ Requires-Python: >=3.7
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Provides-Extra: unicode-backport
36
+
37
+ <h1 align="center">Charset Detection, for Everyone 👋</h1>
38
+
39
+ <p align="center">
40
+ <sup>The Real First Universal Charset Detector</sup><br>
41
+ <a href="https://pypi.org/project/charset-normalizer">
42
+ <img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
43
+ </a>
44
+ <a href="https://pepy.tech/project/charset-normalizer/">
45
+ <img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
46
+ </a>
47
+ <a href="https://bestpractices.coreinfrastructure.org/projects/7297">
48
+ <img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
49
+ </a>
50
+ </p>
51
+ <p align="center">
52
+ <sup><i>Featured Packages</i></sup><br>
53
+ <a href="https://github.com/jawah/niquests">
54
+ <img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
55
+ </a>
56
+ <a href="https://github.com/jawah/wassima">
57
+ <img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
58
+ </a>
59
+ </p>
60
+ <p align="center">
61
+ <sup><i>In other language (unofficial port - by the community)</i></sup><br>
62
+ <a href="https://github.com/nickspring/charset-normalizer-rs">
63
+ <img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
64
+ </a>
65
+ </p>
66
+
67
+ > A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
68
+ > I'm trying to resolve the issue by taking a new approach.
69
+ > All IANA character set names for which the Python core library provides codecs are supported.
70
+
71
+ <p align="center">
72
+ >>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
73
+ </p>
74
+
75
+ This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
76
+
77
+ | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
78
+ |--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
79
+ | `Fast` | ❌ | ✅ | ✅ |
80
+ | `Universal**` | ❌ | ✅ | ❌ |
81
+ | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
82
+ | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
83
+ | `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
84
+ | `Native Python` | ✅ | ✅ | ❌ |
85
+ | `Detect spoken language` | ❌ | ✅ | N/A |
86
+ | `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
87
+ | `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
88
+ | `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
89
+
90
+ <p align="center">
91
+ <img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
92
+ </p>
93
+
94
+ *\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
95
+
96
+ ## ⚡ Performance
97
+
98
+ This package offers better performance than its counterpart, Chardet. Here are some numbers.
99
+
100
+ | Package | Accuracy | Mean per file (ms) | File per sec (est) |
101
+ |-----------------------------------------------|:--------:|:------------------:|:------------------:|
102
+ | [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
103
+ | charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
104
+
105
+ | Package | 99th percentile | 95th percentile | 50th percentile |
106
+ |-----------------------------------------------|:---------------:|:---------------:|:---------------:|
107
+ | [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
108
+ | charset-normalizer | 100 ms | 50 ms | 5 ms |
109
+
110
+ _updated as of december 2024 using CPython 3.12_
111
+
112
+ Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
113
+
114
+ > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
115
+ > And yes, these results might change at any time. The dataset can be updated to include more files.
116
+ > The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
117
+ > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
118
+ > (e.g. Supported Encoding) Challenge-them if you want.
119
+
120
+ ## ✨ Installation
121
+
122
+ Using pip:
123
+
124
+ ```sh
125
+ pip install charset-normalizer -U
126
+ ```
127
+
128
+ ## 🚀 Basic Usage
129
+
130
+ ### CLI
131
+ This package comes with a CLI.
132
+
133
+ ```
134
+ usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
135
+ file [file ...]
136
+
137
+ The Real First Universal Charset Detector. Discover originating encoding used
138
+ on text file. Normalize text to unicode.
139
+
140
+ positional arguments:
141
+ files File(s) to be analysed
142
+
143
+ optional arguments:
144
+ -h, --help show this help message and exit
145
+ -v, --verbose Display complementary information about file if any.
146
+ Stdout will contain logs about the detection process.
147
+ -a, --with-alternative
148
+ Output complementary possibilities if any. Top-level
149
+ JSON WILL be a list.
150
+ -n, --normalize Permit to normalize input file. If not set, program
151
+ does not write anything.
152
+ -m, --minimal Only output the charset detected to STDOUT. Disabling
153
+ JSON output.
154
+ -r, --replace Replace file when trying to normalize it instead of
155
+ creating a new one.
156
+ -f, --force Replace file without asking if you are sure, use this
157
+ flag with caution.
158
+ -t THRESHOLD, --threshold THRESHOLD
159
+ Define a custom maximum amount of chaos allowed in
160
+ decoded content. 0. <= chaos <= 1.
161
+ --version Show version information and exit.
162
+ ```
163
+
164
+ ```bash
165
+ normalizer ./data/sample.1.fr.srt
166
+ ```
167
+
168
+ or
169
+
170
+ ```bash
171
+ python -m charset_normalizer ./data/sample.1.fr.srt
172
+ ```
173
+
174
+ 🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
175
+
176
+ ```json
177
+ {
178
+ "path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
179
+ "encoding": "cp1252",
180
+ "encoding_aliases": [
181
+ "1252",
182
+ "windows_1252"
183
+ ],
184
+ "alternative_encodings": [
185
+ "cp1254",
186
+ "cp1256",
187
+ "cp1258",
188
+ "iso8859_14",
189
+ "iso8859_15",
190
+ "iso8859_16",
191
+ "iso8859_3",
192
+ "iso8859_9",
193
+ "latin_1",
194
+ "mbcs"
195
+ ],
196
+ "language": "French",
197
+ "alphabets": [
198
+ "Basic Latin",
199
+ "Latin-1 Supplement"
200
+ ],
201
+ "has_sig_or_bom": false,
202
+ "chaos": 0.149,
203
+ "coherence": 97.152,
204
+ "unicode_path": null,
205
+ "is_preferred": true
206
+ }
207
+ ```
208
+
209
+ ### Python
210
+ *Just print out normalized text*
211
+ ```python
212
+ from charset_normalizer import from_path
213
+
214
+ results = from_path('./my_subtitle.srt')
215
+
216
+ print(str(results.best()))
217
+ ```
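
Going one step beyond the snippet above, the returned best match also carries metadata. A hedged illustration (not part of the packaged README; the path is the same placeholder as above and the printed values depend entirely on the input file):

```python
# Illustrative follow-up to the snippet above.
from charset_normalizer import from_path

best = from_path('./my_subtitle.srt').best()  # placeholder path reused from the example
if best is not None:
    print(best.encoding)          # e.g. "cp1252"
    print(best.language)          # e.g. "French"
    print(best.encoding_aliases)  # aliases known to Python's codec registry
```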
218
+
219
+ *Upgrade your code without effort*
220
+ ```python
221
+ from charset_normalizer import detect
222
+ ```
223
+
224
+ The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
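
A hedged illustration of that drop-in compatibility (not part of the packaged README; exact values depend on the payload):

```python
# Illustrative only: chardet-style call through charset_normalizer's legacy detect().
from charset_normalizer import detect

raw = "Bсеки човек има право на образование.".encode("utf_8")
result = detect(raw)

# The result follows the chardet dictionary convention.
print(result["encoding"], result["language"], result["confidence"])
```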
225
+
226
+ See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
227
+
228
+ ## 😇 Why
229
+
230
+ When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
231
+ reliable alternative using a completely different method. Also! I never back down on a good challenge!
232
+
233
+ I **don't care** about the **originating charset** encoding, because **two different tables** can
234
+ produce **two identical rendered string.**
235
+ What I want is to get readable text, the best I can.
236
+
237
+ In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
238
+
239
+ Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair broken Unicode strings, whereas charset-normalizer converts raw files in unknown encodings to Unicode.
240
+
241
+ ## 🍰 How
242
+
243
+ - Discard all charset encoding table that could not fit the binary content.
244
+ - Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
245
+ - Extract matches with the lowest mess detected.
246
+ - Additionally, we measure coherence / probe for a language.
247
+
248
+ **Wait a minute**, what is noise/mess and coherence according to **YOU ?**
249
+
250
+ *Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
251
+ **I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
252
+ I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
253
+ improve or rewrite it.
254
+
255
+ *Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
256
+ that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
257
+
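To make the noise and coherence idea above concrete, here is a deliberately naive sketch of the brute-force loop; it is illustrative only and bears no resemblance to the package's real mess and coherence probes:

```python
# Toy sketch only: nothing here is charset_normalizer's actual implementation.
CANDIDATES = ["utf_8", "cp1252", "latin_1"]  # arbitrary, illustrative subset


def mess_score(text: str) -> float:
    # Extremely naive noise proxy: share of unprintable characters.
    bad = sum(1 for ch in text if not ch.isprintable() and ch not in "\r\n\t")
    return bad / max(len(text), 1)


def naive_best_guess(payload: bytes):
    scored = []
    for codec in CANDIDATES:
        try:
            decoded = payload.decode(codec)
        except UnicodeDecodeError:
            continue  # this table cannot fit the bytes at all: discard it
        scored.append((mess_score(decoded), codec, decoded))
    if not scored:
        return None  # nothing decodes cleanly: likely binary content
    _score, codec, decoded = min(scored)
    return codec, decoded


print(naive_best_guess("héllo wörld".encode("cp1252")))
```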
258
+ ## ⚡ Known limitations
259
+
260
+ - Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
261
+ - Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
262
+
263
+ ## ⚠️ About Python EOLs
264
+
265
+ **If you are running:**
266
+
267
+ - Python >=2.7,<3.5: Unsupported
268
+ - Python 3.5: charset-normalizer < 2.1
269
+ - Python 3.6: charset-normalizer < 3.1
270
+ - Python 3.7: charset-normalizer < 4.0
271
+
272
+ Upgrade your Python interpreter as soon as possible.
273
+
274
+ ## 👤 Contributing
275
+
276
+ Contributions, issues and feature requests are very much welcome.<br />
277
+ Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
278
+
279
+ ## 📝 License
280
+
281
+ Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
282
+ This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
283
+
284
+ Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
285
+
286
+ ## 💼 For Enterprise
287
+
288
+ Professional support for charset-normalizer is available as part of the [Tidelift
289
+ Subscription][1]. Tidelift gives software development teams a single source for
290
+ purchasing and maintaining their software, with professional grade assurances
291
+ from the experts who know it best, while seamlessly integrating with existing
292
+ tools.
293
+
294
+ [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
295
+
296
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
297
+
298
+ # Changelog
299
+ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
300
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
301
+
302
+ ## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
303
+
304
+ ### Changed
305
+ - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
306
+ - Enforce annotation delayed loading for a simpler and consistent types in the project.
307
+ - Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
308
+
309
+ ### Added
310
+ - pre-commit configuration.
311
+ - noxfile.
312
+
313
+ ### Removed
314
+ - `build-requirements.txt` as per using `pyproject.toml` native build configuration.
315
+ - `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
316
+ - `setup.cfg` in favor of `pyproject.toml` metadata configuration.
317
+ - Unused `utils.range_scan` function.
318
+
319
+ ### Fixed
320
+ - Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
321
+ - Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
322
+
323
+ ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
324
+
325
+ ### Added
326
+ - Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
327
+ - Support for Python 3.13 (#512)
328
+
329
+ ### Fixed
330
+ - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
331
+ - Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
332
+ - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
333
+
334
+ ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
335
+
336
+ ### Fixed
337
+ - Unintentional memory usage regression when using large payload that match several encoding (#376)
338
+ - Regression on some detection case showcased in the documentation (#371)
339
+
340
+ ### Added
341
+ - Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
342
+
343
+ ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
344
+
345
+ ### Changed
346
+ - Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
347
+ - Improved the general detection reliability based on reports from the community
348
+
349
+ ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
350
+
351
+ ### Added
352
+ - Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
353
+ - Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
354
+
355
+ ### Removed
356
+ - (internal) Redundant utils.is_ascii function and unused function is_private_use_only
357
+ - (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
358
+
359
+ ### Changed
360
+ - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
361
+ - Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
362
+
363
+ ### Fixed
364
+ - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
365
+
366
+ ## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
367
+
368
+ ### Changed
369
+ - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
370
+ - Minor improvement over the global detection reliability
371
+
372
+ ### Added
373
+ - Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
374
+ - Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
375
+ - Explicit support for Python 3.12
376
+
377
+ ### Fixed
378
+ - Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
379
+
380
+ ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
381
+
382
+ ### Added
383
+ - Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
384
+
385
+ ### Removed
386
+ - Support for Python 3.6 (PR #260)
387
+
388
+ ### Changed
389
+ - Optional speedup provided by mypy/c 1.0.1
390
+
391
+ ## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
392
+
393
+ ### Fixed
394
+ - Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
395
+
396
+ ### Changed
397
+ - Speedup provided by mypy/c 0.990 on Python >= 3.7
398
+
399
+ ## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
400
+
401
+ ### Added
402
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
403
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
404
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
405
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
406
+
407
+ ### Changed
408
+ - Build with static metadata using 'build' frontend
409
+ - Make the language detection stricter
410
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
411
+
412
+ ### Fixed
413
+ - CLI with opt --normalize fail when using full path for files
414
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
415
+ - Sphinx warnings when generating the documentation
416
+
417
+ ### Removed
418
+ - Coherence detector no longer return 'Simple English' instead return 'English'
419
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
420
+ - Breaking: Method `first()` and `best()` from CharsetMatch
421
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
422
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
423
+ - Breaking: Top-level function `normalize`
424
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
425
+ - Support for the backport `unicodedata2`
426
+
427
+ ## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
428
+
429
+ ### Added
430
+ - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
431
+ - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
432
+ - Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
433
+
434
+ ### Changed
435
+ - Build with static metadata using 'build' frontend
436
+ - Make the language detection stricter
437
+
438
+ ### Fixed
439
+ - CLI with opt --normalize fail when using full path for files
440
+ - TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
441
+
442
+ ### Removed
443
+ - Coherence detector no longer return 'Simple English' instead return 'English'
444
+ - Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
445
+
446
+ ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
447
+
448
+ ### Added
449
+ - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
450
+
451
+ ### Removed
452
+ - Breaking: Method `first()` and `best()` from CharsetMatch
453
+ - UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
454
+
455
+ ### Fixed
456
+ - Sphinx warnings when generating the documentation
457
+
458
+ ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
459
+
460
+ ### Changed
461
+ - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
462
+
463
+ ### Removed
464
+ - Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
465
+ - Breaking: Top-level function `normalize`
466
+ - Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
467
+ - Support for the backport `unicodedata2`
468
+
469
+ ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
470
+
471
+ ### Deprecated
472
+ - Function `normalize` scheduled for removal in 3.0
473
+
474
+ ### Changed
475
+ - Removed useless call to decode in fn is_unprintable (#206)
476
+
477
+ ### Fixed
478
+ - Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
479
+
480
+ ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
481
+
482
+ ### Added
483
+ - Output the Unicode table version when running the CLI with `--version` (PR #194)
484
+
485
+ ### Changed
486
+ - Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
487
+ - Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
488
+
489
+ ### Fixed
490
+ - Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
491
+ - CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
492
+
493
+ ### Removed
494
+ - Support for Python 3.5 (PR #192)
495
+
496
+ ### Deprecated
497
+ - Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
498
+
499
+ ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
500
+
501
+ ### Fixed
502
+ - ASCII miss-detection on rare cases (PR #170)
503
+
504
+ ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
505
+
506
+ ### Added
507
+ - Explicit support for Python 3.11 (PR #164)
508
+
509
+ ### Changed
510
+ - The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
511
+
512
+ ## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
513
+
514
+ ### Fixed
515
+ - Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
516
+
517
+ ### Changed
518
+ - Skipping the language-detection (CD) on ASCII (PR #155)
519
+
520
+ ## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
521
+
522
+ ### Changed
523
+ - Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
524
+
525
+ ### Fixed
526
+ - Wrong logging level applied when setting kwarg `explain` to True (PR #146)
527
+
528
+ ## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
529
+ ### Changed
530
+ - Improvement over Vietnamese detection (PR #126)
531
+ - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
532
+ - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
533
+ - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
534
+ - Code style as refactored by Sourcery-AI (PR #131)
535
+ - Minor adjustment on the MD around european words (PR #133)
536
+ - Remove and replace SRTs from assets / tests (PR #139)
537
+ - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
538
+ - Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
539
+
540
+ ### Fixed
541
+ - Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
542
+ - Avoid using too insignificant chunk (PR #137)
543
+
544
+ ### Added
545
+ - Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
546
+ - Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
547
+
548
+ ## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
549
+ ### Added
550
+ - Add support for Kazakh (Cyrillic) language detection (PR #109)
551
+
552
+ ### Changed
553
+ - Further, improve inferring the language from a given single-byte code page (PR #112)
554
+ - Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
555
+ - Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
556
+ - Various detection improvement (MD+CD) (PR #117)
557
+
558
+ ### Removed
559
+ - Remove redundant logging entry about detected language(s) (PR #115)
560
+
561
+ ### Fixed
562
+ - Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
563
+
564
+ ## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
565
+ ### Fixed
566
+ - Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
567
+ - Fix CLI crash when using --minimal output in certain cases (PR #103)
568
+
569
+ ### Changed
570
+ - Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
571
+
572
+ ## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
573
+ ### Changed
574
+ - The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
575
+ - The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
576
+ - The Unicode detection is slightly improved (PR #93)
577
+ - Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
578
+
579
+ ### Removed
580
+ - The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
581
+
582
+ ### Fixed
583
+ - In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
584
+ - Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
585
+ - The MANIFEST.in was not exhaustive (PR #78)
586
+
587
+ ## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
588
+ ### Fixed
589
+ - The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
590
+ - Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
591
+ - The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
592
+ - Submatch factoring could be wrong in rare edge cases (PR #72)
593
+ - Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
594
+ - Fix line endings from CRLF to LF for certain project files (PR #67)
595
+
596
+ ### Changed
597
+ - Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
598
+ - Allow fallback on specified encoding if any (PR #71)
599
+
600
+ ## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
601
+ ### Changed
602
+ - Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
603
+ - According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
604
+
605
+ ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
606
+ ### Fixed
607
+ - Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
608
+
609
+ ### Changed
610
+ - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
611
+
612
+ ## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
613
+ ### Fixed
614
+ - Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
615
+ - Using explain=False permanently disable the verbose output in the current runtime (PR #47)
616
+ - One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
617
+ - Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
618
+
619
+ ### Changed
620
+ - Public function normalize default args values were not aligned with from_bytes (PR #53)
621
+
622
+ ### Added
623
+ - You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
624
+
625
+ ## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
626
+ ### Changed
627
+ - 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
628
+ - Accent has been made on UTF-8 detection, should perform rather instantaneous.
629
+ - The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
630
+ - The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
631
+ - The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
632
+ - utf_7 detection has been reinstated.
633
+
634
+ ### Removed
635
+ - This package no longer require anything when used with Python 3.5 (Dropped cached_property)
636
+ - Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
637
+ - The exception hook on UnicodeDecodeError has been removed.
638
+
639
+ ### Deprecated
640
+ - Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
641
+
642
+ ### Fixed
643
+ - The CLI output used the relative path of the file(s). Should be absolute.
644
+
645
+ ## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
646
+ ### Fixed
647
+ - Logger configuration/usage no longer conflict with others (PR #44)
648
+
649
+ ## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
650
+ ### Removed
651
+ - Using standard logging instead of using the package loguru.
652
+ - Dropping nose test framework in favor of the maintained pytest.
653
+ - Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
654
+ - Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
655
+ - Stop support for UTF-7 that does not contain a SIG.
656
+ - Dropping PrettyTable, replaced with pure JSON output in CLI.
657
+
658
+ ### Fixed
659
+ - BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
660
+ - Not searching properly for the BOM when trying utf32/16 parent codec.
661
+
662
+ ### Changed
663
+ - Improving the package final size by compressing frequencies.json.
664
+ - Huge improvement over the larges payload.
665
+
666
+ ### Added
667
+ - CLI now produces JSON consumable output.
668
+ - Return ASCII if given sequences fit. Given reasonable confidence.
669
+
670
+ ## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
671
+
672
+ ### Fixed
673
+ - In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
674
+
675
+ ## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
676
+
677
+ ### Fixed
678
+ - Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
679
+
680
+ ## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
681
+
682
+ ### Fixed
683
+ - The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
684
+
685
+ ## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
686
+
687
+ ### Changed
688
+ - Amend the previous release to allow prettytable 2.0 (PR #35)
689
+
690
+ ## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
691
+
692
+ ### Fixed
693
+ - Fix error while using the package with a python pre-release interpreter (PR #33)
694
+
695
+ ### Changed
696
+ - Dependencies refactoring, constraints revised.
697
+
698
+ ### Added
699
+ - Add python 3.9 and 3.10 to the supported interpreters
700
+
701
+ MIT License
702
+
703
+ Copyright (c) 2025 TAHRI Ahmed R.
704
+
705
+ Permission is hereby granted, free of charge, to any person obtaining a copy
706
+ of this software and associated documentation files (the "Software"), to deal
707
+ in the Software without restriction, including without limitation the rights
708
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
709
+ copies of the Software, and to permit persons to whom the Software is
710
+ furnished to do so, subject to the following conditions:
711
+
712
+ The above copyright notice and this permission notice shall be included in all
713
+ copies or substantial portions of the Software.
714
+
715
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
716
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
717
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
718
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
719
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
720
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
721
+ SOFTWARE.
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
1
+ ../../Scripts/normalizer.exe,sha256=aGyf7WAVLi4gHrr8F-d9-4fQG9ifpfMEXEvLwyt8KjI,108411
2
+ charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
3
+ charset_normalizer-3.4.1.dist-info/LICENSE,sha256=GFd0hdNwTxpHne2OVzwJds_tMV_S_ReYP6mI2kwvcNE,1092
4
+ charset_normalizer-3.4.1.dist-info/METADATA,sha256=0_fAC3DknimRZusm6kkP4ylPD0JVzBq5mKHWLNBJM6w,36034
5
+ charset_normalizer-3.4.1.dist-info/RECORD,,
6
+ charset_normalizer-3.4.1.dist-info/WHEEL,sha256=pWXrJbnZSH-J-PhYmKs2XNn4DHCPNBYq965vsBJBFvA,101
7
+ charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
8
+ charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
9
+ charset_normalizer/__init__.py,sha256=0NT8MHi7SKq3juMqYfOdrkzjisK0L73lneNHH4qaUAs,1638
10
+ charset_normalizer/__main__.py,sha256=2sj_BS6H0sU25C1bMqz9DVwa6kOK9lchSEbSU-_iu7M,115
11
+ charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
12
+ charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
13
+ charset_normalizer/__pycache__/api.cpython-312.pyc,,
14
+ charset_normalizer/__pycache__/cd.cpython-312.pyc,,
15
+ charset_normalizer/__pycache__/constant.cpython-312.pyc,,
16
+ charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
17
+ charset_normalizer/__pycache__/md.cpython-312.pyc,,
18
+ charset_normalizer/__pycache__/models.cpython-312.pyc,,
19
+ charset_normalizer/__pycache__/utils.cpython-312.pyc,,
20
+ charset_normalizer/__pycache__/version.cpython-312.pyc,,
21
+ charset_normalizer/api.py,sha256=2a0p2Gnhbdo9O6C04CNxTSN23fIbgOF20nxb0pWPNFM,23285
22
+ charset_normalizer/cd.py,sha256=uq8nVxRpR6Guc16ACvOWtL8KO3w7vYaCh8hHisuOyTg,12917
23
+ charset_normalizer/cli/__init__.py,sha256=d9MUx-1V_qD3x9igIy4JT4oC5CU0yjulk7QyZWeRFhg,144
24
+ charset_normalizer/cli/__main__.py,sha256=lZ89qRWun7FRxX0qm1GhK-m0DH0i048yiMAX1mVIuRg,10731
25
+ charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
26
+ charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
27
+ charset_normalizer/constant.py,sha256=7OKYi28cJjZxIcX3lQCwfK9ijoOgaVEbERww7SqqNSY,42475
28
+ charset_normalizer/legacy.py,sha256=v8An1aAQHUu036UWOhyIaDGkirZ0t4hfNVlyje5KInU,2394
29
+ charset_normalizer/md.cp312-win_amd64.pyd,sha256=XBGy--IKda7c3iBfvw_dovocqb2RSucmVtxvtlG_3tA,10752
30
+ charset_normalizer/md.py,sha256=e452fhwIAguEUr3FJzG7QZvFgXI-dVLOh_M1ZUiFI6U,20666
31
+ charset_normalizer/md__mypyc.cp312-win_amd64.pyd,sha256=_-jWSji0BgBVvrIHbmabYQNMBF4-xTusdO5mu6P8JsA,125440
32
+ charset_normalizer/models.py,sha256=ZR2PE-fqf6dASZfqdE5Uhkmr0o1MciSdXOjuNqwkmvg,12754
33
+ charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ charset_normalizer/utils.py,sha256=oH9Q3WcAMwmsSB7uM8uDozz9DXnkYecbkTNbdnMbgzI,12410
35
+ charset_normalizer/version.py,sha256=7_thI7FzRQxEsbtUYwrJs3FCFWF666mw74H8mggPRR0,123
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.6.0)
+ Root-Is-Purelib: false
+ Tag: cp312-cp312-win_amd64
+
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ normalizer = charset_normalizer:cli.cli_detect
env/Lib/site-packages/charset_normalizer-3.4.1.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ charset_normalizer
env/Lib/site-packages/charset_normalizer/__init__.py ADDED
@@ -0,0 +1,48 @@
1
+ """
2
+ Charset-Normalizer
3
+ ~~~~~~~~~~~~~~
4
+ The Real First Universal Charset Detector.
5
+ A library that helps you read text from an unknown charset encoding.
6
+ Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
7
+ All IANA character set names for which the Python core library provides codecs are supported.
8
+
9
+ Basic usage:
10
+ >>> from charset_normalizer import from_bytes
11
+ >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12
+ >>> best_guess = results.best()
13
+ >>> str(best_guess)
14
+ 'Bсеки човек има право на образование. Oбразованието!'
15
+
16
+ Others methods and usages are available - see the full documentation
17
+ at <https://github.com/Ousret/charset_normalizer>.
18
+ :copyright: (c) 2021 by Ahmed TAHRI
19
+ :license: MIT, see LICENSE for more details.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+
26
+ from .api import from_bytes, from_fp, from_path, is_binary
27
+ from .legacy import detect
28
+ from .models import CharsetMatch, CharsetMatches
29
+ from .utils import set_logging_handler
30
+ from .version import VERSION, __version__
31
+
32
+ __all__ = (
33
+ "from_fp",
34
+ "from_path",
35
+ "from_bytes",
36
+ "is_binary",
37
+ "detect",
38
+ "CharsetMatch",
39
+ "CharsetMatches",
40
+ "__version__",
41
+ "VERSION",
42
+ "set_logging_handler",
43
+ )
44
+
45
+ # Attach a NullHandler to the top level logger by default
46
+ # https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47
+
48
+ logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
env/Lib/site-packages/charset_normalizer/__main__.py ADDED
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from .cli import cli_detect
+
+ if __name__ == "__main__":
+     cli_detect()
env/Lib/site-packages/charset_normalizer/api.py ADDED
@@ -0,0 +1,668 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from os import PathLike
5
+ from typing import BinaryIO
6
+
7
+ from .cd import (
8
+ coherence_ratio,
9
+ encoding_languages,
10
+ mb_encoding_languages,
11
+ merge_coherence_ratios,
12
+ )
13
+ from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
14
+ from .md import mess_ratio
15
+ from .models import CharsetMatch, CharsetMatches
16
+ from .utils import (
17
+ any_specified_encoding,
18
+ cut_sequence_chunks,
19
+ iana_name,
20
+ identify_sig_or_bom,
21
+ is_cp_similar,
22
+ is_multi_byte_encoding,
23
+ should_strip_sig_or_bom,
24
+ )
25
+
26
+ logger = logging.getLogger("charset_normalizer")
27
+ explain_handler = logging.StreamHandler()
28
+ explain_handler.setFormatter(
29
+ logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
30
+ )
31
+
32
+
33
+ def from_bytes(
34
+ sequences: bytes | bytearray,
35
+ steps: int = 5,
36
+ chunk_size: int = 512,
37
+ threshold: float = 0.2,
38
+ cp_isolation: list[str] | None = None,
39
+ cp_exclusion: list[str] | None = None,
40
+ preemptive_behaviour: bool = True,
41
+ explain: bool = False,
42
+ language_threshold: float = 0.1,
43
+ enable_fallback: bool = True,
44
+ ) -> CharsetMatches:
45
+ """
46
+ Given a raw bytes sequence, return the best possibles charset usable to render str objects.
47
+ If there is no results, it is a strong indicator that the source is binary/not text.
48
+ By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
49
+ And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
50
+
51
+ The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
52
+ but never take it for granted. Can improve the performance.
53
+
54
+ You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
55
+ purpose.
56
+
57
+ This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
58
+ By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
59
+ toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
60
+ Custom logging format and handler can be set manually.
61
+ """
62
+
63
+ if not isinstance(sequences, (bytearray, bytes)):
64
+ raise TypeError(
65
+ "Expected object of type bytes or bytearray, got: {}".format(
66
+ type(sequences)
67
+ )
68
+ )
69
+
70
+ if explain:
71
+ previous_logger_level: int = logger.level
72
+ logger.addHandler(explain_handler)
73
+ logger.setLevel(TRACE)
74
+
75
+ length: int = len(sequences)
76
+
77
+ if length == 0:
78
+ logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
79
+ if explain: # Defensive: ensure exit path clean handler
80
+ logger.removeHandler(explain_handler)
81
+ logger.setLevel(previous_logger_level or logging.WARNING)
82
+ return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
83
+
84
+ if cp_isolation is not None:
85
+ logger.log(
86
+ TRACE,
87
+ "cp_isolation is set. use this flag for debugging purpose. "
88
+ "limited list of encoding allowed : %s.",
89
+ ", ".join(cp_isolation),
90
+ )
91
+ cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
92
+ else:
93
+ cp_isolation = []
94
+
95
+ if cp_exclusion is not None:
96
+ logger.log(
97
+ TRACE,
98
+ "cp_exclusion is set. use this flag for debugging purpose. "
99
+ "limited list of encoding excluded : %s.",
100
+ ", ".join(cp_exclusion),
101
+ )
102
+ cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
103
+ else:
104
+ cp_exclusion = []
105
+
106
+ if length <= (chunk_size * steps):
107
+ logger.log(
108
+ TRACE,
109
+ "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
110
+ steps,
111
+ chunk_size,
112
+ length,
113
+ )
114
+ steps = 1
115
+ chunk_size = length
116
+
117
+ if steps > 1 and length / steps < chunk_size:
118
+ chunk_size = int(length / steps)
119
+
120
+ is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
121
+ is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
122
+
123
+ if is_too_small_sequence:
124
+ logger.log(
125
+ TRACE,
126
+ "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
127
+ length
128
+ ),
129
+ )
130
+ elif is_too_large_sequence:
131
+ logger.log(
132
+ TRACE,
133
+ "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
134
+ length
135
+ ),
136
+ )
137
+
138
+ prioritized_encodings: list[str] = []
139
+
140
+ specified_encoding: str | None = (
141
+ any_specified_encoding(sequences) if preemptive_behaviour else None
142
+ )
143
+
144
+ if specified_encoding is not None:
145
+ prioritized_encodings.append(specified_encoding)
146
+ logger.log(
147
+ TRACE,
148
+ "Detected declarative mark in sequence. Priority +1 given for %s.",
149
+ specified_encoding,
150
+ )
151
+
152
+ tested: set[str] = set()
153
+ tested_but_hard_failure: list[str] = []
154
+ tested_but_soft_failure: list[str] = []
155
+
156
+ fallback_ascii: CharsetMatch | None = None
157
+ fallback_u8: CharsetMatch | None = None
158
+ fallback_specified: CharsetMatch | None = None
159
+
160
+ results: CharsetMatches = CharsetMatches()
161
+
162
+ early_stop_results: CharsetMatches = CharsetMatches()
163
+
164
+ sig_encoding, sig_payload = identify_sig_or_bom(sequences)
165
+
166
+ if sig_encoding is not None:
167
+ prioritized_encodings.append(sig_encoding)
168
+ logger.log(
169
+ TRACE,
170
+ "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
171
+ len(sig_payload),
172
+ sig_encoding,
173
+ )
174
+
175
+ prioritized_encodings.append("ascii")
176
+
177
+ if "utf_8" not in prioritized_encodings:
178
+ prioritized_encodings.append("utf_8")
179
+
180
+ for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
181
+ if cp_isolation and encoding_iana not in cp_isolation:
182
+ continue
183
+
184
+ if cp_exclusion and encoding_iana in cp_exclusion:
185
+ continue
186
+
187
+ if encoding_iana in tested:
188
+ continue
189
+
190
+ tested.add(encoding_iana)
191
+
192
+ decoded_payload: str | None = None
193
+ bom_or_sig_available: bool = sig_encoding == encoding_iana
194
+ strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
195
+ encoding_iana
196
+ )
197
+
198
+ if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
199
+ logger.log(
200
+ TRACE,
201
+ "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
202
+ encoding_iana,
203
+ )
204
+ continue
205
+ if encoding_iana in {"utf_7"} and not bom_or_sig_available:
206
+ logger.log(
207
+ TRACE,
208
+ "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
209
+ encoding_iana,
210
+ )
211
+ continue
212
+
213
+ try:
214
+ is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
215
+ except (ModuleNotFoundError, ImportError):
216
+ logger.log(
217
+ TRACE,
218
+ "Encoding %s does not provide an IncrementalDecoder",
219
+ encoding_iana,
220
+ )
221
+ continue
222
+
223
+ try:
224
+ if is_too_large_sequence and is_multi_byte_decoder is False:
225
+ str(
226
+ (
227
+ sequences[: int(50e4)]
228
+ if strip_sig_or_bom is False
229
+ else sequences[len(sig_payload) : int(50e4)]
230
+ ),
231
+ encoding=encoding_iana,
232
+ )
233
+ else:
234
+ decoded_payload = str(
235
+ (
236
+ sequences
237
+ if strip_sig_or_bom is False
238
+ else sequences[len(sig_payload) :]
239
+ ),
240
+ encoding=encoding_iana,
241
+ )
242
+ except (UnicodeDecodeError, LookupError) as e:
243
+ if not isinstance(e, LookupError):
244
+ logger.log(
245
+ TRACE,
246
+ "Code page %s does not fit given bytes sequence at ALL. %s",
247
+ encoding_iana,
248
+ str(e),
249
+ )
250
+ tested_but_hard_failure.append(encoding_iana)
251
+ continue
252
+
253
+ similar_soft_failure_test: bool = False
254
+
255
+ for encoding_soft_failed in tested_but_soft_failure:
256
+ if is_cp_similar(encoding_iana, encoding_soft_failed):
257
+ similar_soft_failure_test = True
258
+ break
259
+
260
+ if similar_soft_failure_test:
261
+ logger.log(
262
+ TRACE,
263
+ "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
264
+ encoding_iana,
265
+ encoding_soft_failed,
266
+ )
267
+ continue
268
+
269
+ r_ = range(
270
+ 0 if not bom_or_sig_available else len(sig_payload),
271
+ length,
272
+ int(length / steps),
273
+ )
274
+
275
+ multi_byte_bonus: bool = (
276
+ is_multi_byte_decoder
277
+ and decoded_payload is not None
278
+ and len(decoded_payload) < length
279
+ )
280
+
281
+ if multi_byte_bonus:
282
+ logger.log(
283
+ TRACE,
284
+ "Code page %s is a multi byte encoding table and it appear that at least one character "
285
+ "was encoded using n-bytes.",
286
+ encoding_iana,
287
+ )
288
+
289
+ max_chunk_gave_up: int = int(len(r_) / 4)
290
+
291
+ max_chunk_gave_up = max(max_chunk_gave_up, 2)
292
+ early_stop_count: int = 0
293
+ lazy_str_hard_failure = False
294
+
295
+ md_chunks: list[str] = []
296
+ md_ratios = []
297
+
298
+ try:
299
+ for chunk in cut_sequence_chunks(
300
+ sequences,
301
+ encoding_iana,
302
+ r_,
303
+ chunk_size,
304
+ bom_or_sig_available,
305
+ strip_sig_or_bom,
306
+ sig_payload,
307
+ is_multi_byte_decoder,
308
+ decoded_payload,
309
+ ):
310
+ md_chunks.append(chunk)
311
+
312
+ md_ratios.append(
313
+ mess_ratio(
314
+ chunk,
315
+ threshold,
316
+ explain is True and 1 <= len(cp_isolation) <= 2,
317
+ )
318
+ )
319
+
320
+ if md_ratios[-1] >= threshold:
321
+ early_stop_count += 1
322
+
323
+ if (early_stop_count >= max_chunk_gave_up) or (
324
+ bom_or_sig_available and strip_sig_or_bom is False
325
+ ):
326
+ break
327
+ except (
328
+ UnicodeDecodeError
329
+ ) as e: # Lazy str loading may have missed something there
330
+ logger.log(
331
+ TRACE,
332
+ "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
333
+ encoding_iana,
334
+ str(e),
335
+ )
336
+ early_stop_count = max_chunk_gave_up
337
+ lazy_str_hard_failure = True
338
+
339
+ # We might want to check the sequence again with the whole content
340
+ # Only if the initial MD tests pass
341
+ if (
342
+ not lazy_str_hard_failure
343
+ and is_too_large_sequence
344
+ and not is_multi_byte_decoder
345
+ ):
346
+ try:
347
+ sequences[int(50e3) :].decode(encoding_iana, errors="strict")
348
+ except UnicodeDecodeError as e:
349
+ logger.log(
350
+ TRACE,
351
+ "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
352
+ encoding_iana,
353
+ str(e),
354
+ )
355
+ tested_but_hard_failure.append(encoding_iana)
356
+ continue
357
+
358
+ mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
359
+ if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
360
+ tested_but_soft_failure.append(encoding_iana)
361
+ logger.log(
362
+ TRACE,
363
+ "%s was excluded because of initial chaos probing. Gave up %i time(s). "
364
+ "Computed mean chaos is %f %%.",
365
+ encoding_iana,
366
+ early_stop_count,
367
+ round(mean_mess_ratio * 100, ndigits=3),
368
+ )
369
+ # Preparing those fallbacks in case we got nothing.
370
+ if (
371
+ enable_fallback
372
+ and encoding_iana in ["ascii", "utf_8", specified_encoding]
373
+ and not lazy_str_hard_failure
374
+ ):
375
+ fallback_entry = CharsetMatch(
376
+ sequences,
377
+ encoding_iana,
378
+ threshold,
379
+ False,
380
+ [],
381
+ decoded_payload,
382
+ preemptive_declaration=specified_encoding,
383
+ )
384
+ if encoding_iana == specified_encoding:
385
+ fallback_specified = fallback_entry
386
+ elif encoding_iana == "ascii":
387
+ fallback_ascii = fallback_entry
388
+ else:
389
+ fallback_u8 = fallback_entry
390
+ continue
391
+
392
+ logger.log(
393
+ TRACE,
394
+ "%s passed initial chaos probing. Mean measured chaos is %f %%",
395
+ encoding_iana,
396
+ round(mean_mess_ratio * 100, ndigits=3),
397
+ )
398
+
399
+ if not is_multi_byte_decoder:
400
+ target_languages: list[str] = encoding_languages(encoding_iana)
401
+ else:
402
+ target_languages = mb_encoding_languages(encoding_iana)
403
+
404
+ if target_languages:
405
+ logger.log(
406
+ TRACE,
407
+ "{} should target any language(s) of {}".format(
408
+ encoding_iana, str(target_languages)
409
+ ),
410
+ )
411
+
412
+ cd_ratios = []
413
+
414
+ # We shall skip the CD when it's about ASCII
415
+ # Most of the time it's not relevant to run "language-detection" on it.
416
+ if encoding_iana != "ascii":
417
+ for chunk in md_chunks:
418
+ chunk_languages = coherence_ratio(
419
+ chunk,
420
+ language_threshold,
421
+ ",".join(target_languages) if target_languages else None,
422
+ )
423
+
424
+ cd_ratios.append(chunk_languages)
425
+
426
+ cd_ratios_merged = merge_coherence_ratios(cd_ratios)
427
+
428
+ if cd_ratios_merged:
429
+ logger.log(
430
+ TRACE,
431
+ "We detected language {} using {}".format(
432
+ cd_ratios_merged, encoding_iana
433
+ ),
434
+ )
435
+
436
+ current_match = CharsetMatch(
437
+ sequences,
438
+ encoding_iana,
439
+ mean_mess_ratio,
440
+ bom_or_sig_available,
441
+ cd_ratios_merged,
442
+ (
443
+ decoded_payload
444
+ if (
445
+ is_too_large_sequence is False
446
+ or encoding_iana in [specified_encoding, "ascii", "utf_8"]
447
+ )
448
+ else None
449
+ ),
450
+ preemptive_declaration=specified_encoding,
451
+ )
452
+
453
+ results.append(current_match)
454
+
455
+ if (
456
+ encoding_iana in [specified_encoding, "ascii", "utf_8"]
457
+ and mean_mess_ratio < 0.1
458
+ ):
459
+ # If md says nothing to worry about, then... stop immediately!
460
+ if mean_mess_ratio == 0.0:
461
+ logger.debug(
462
+ "Encoding detection: %s is most likely the one.",
463
+ current_match.encoding,
464
+ )
465
+ if explain: # Defensive: ensure exit path clean handler
466
+ logger.removeHandler(explain_handler)
467
+ logger.setLevel(previous_logger_level)
468
+ return CharsetMatches([current_match])
469
+
470
+ early_stop_results.append(current_match)
471
+
472
+ if (
473
+ len(early_stop_results)
474
+ and (specified_encoding is None or specified_encoding in tested)
475
+ and "ascii" in tested
476
+ and "utf_8" in tested
477
+ ):
478
+ probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
479
+ logger.debug(
480
+ "Encoding detection: %s is most likely the one.",
481
+ probable_result.encoding,
482
+ )
483
+ if explain: # Defensive: ensure exit path clean handler
484
+ logger.removeHandler(explain_handler)
485
+ logger.setLevel(previous_logger_level)
486
+
487
+ return CharsetMatches([probable_result])
488
+
489
+ if encoding_iana == sig_encoding:
490
+ logger.debug(
491
+ "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
492
+ "the beginning of the sequence.",
493
+ encoding_iana,
494
+ )
495
+ if explain: # Defensive: ensure exit path clean handler
496
+ logger.removeHandler(explain_handler)
497
+ logger.setLevel(previous_logger_level)
498
+ return CharsetMatches([results[encoding_iana]])
499
+
500
+ if len(results) == 0:
501
+ if fallback_u8 or fallback_ascii or fallback_specified:
502
+ logger.log(
503
+ TRACE,
504
+ "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
505
+ )
506
+
507
+ if fallback_specified:
508
+ logger.debug(
509
+ "Encoding detection: %s will be used as a fallback match",
510
+ fallback_specified.encoding,
511
+ )
512
+ results.append(fallback_specified)
513
+ elif (
514
+ (fallback_u8 and fallback_ascii is None)
515
+ or (
516
+ fallback_u8
517
+ and fallback_ascii
518
+ and fallback_u8.fingerprint != fallback_ascii.fingerprint
519
+ )
520
+ or (fallback_u8 is not None)
521
+ ):
522
+ logger.debug("Encoding detection: utf_8 will be used as a fallback match")
523
+ results.append(fallback_u8)
524
+ elif fallback_ascii:
525
+ logger.debug("Encoding detection: ascii will be used as a fallback match")
526
+ results.append(fallback_ascii)
527
+
528
+ if results:
529
+ logger.debug(
530
+ "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
531
+ results.best().encoding, # type: ignore
532
+ len(results) - 1,
533
+ )
534
+ else:
535
+ logger.debug("Encoding detection: Unable to determine any suitable charset.")
536
+
537
+ if explain:
538
+ logger.removeHandler(explain_handler)
539
+ logger.setLevel(previous_logger_level)
540
+
541
+ return results
542
+
543
+
544
+ def from_fp(
545
+ fp: BinaryIO,
546
+ steps: int = 5,
547
+ chunk_size: int = 512,
548
+ threshold: float = 0.20,
549
+ cp_isolation: list[str] | None = None,
550
+ cp_exclusion: list[str] | None = None,
551
+ preemptive_behaviour: bool = True,
552
+ explain: bool = False,
553
+ language_threshold: float = 0.1,
554
+ enable_fallback: bool = True,
555
+ ) -> CharsetMatches:
556
+ """
557
+ Same thing as the function from_bytes but using a file pointer that is already ready.
558
+ Will not close the file pointer.
559
+ """
560
+ return from_bytes(
561
+ fp.read(),
562
+ steps,
563
+ chunk_size,
564
+ threshold,
565
+ cp_isolation,
566
+ cp_exclusion,
567
+ preemptive_behaviour,
568
+ explain,
569
+ language_threshold,
570
+ enable_fallback,
571
+ )
572
+
573
+
574
+ def from_path(
575
+ path: str | bytes | PathLike, # type: ignore[type-arg]
576
+ steps: int = 5,
577
+ chunk_size: int = 512,
578
+ threshold: float = 0.20,
579
+ cp_isolation: list[str] | None = None,
580
+ cp_exclusion: list[str] | None = None,
581
+ preemptive_behaviour: bool = True,
582
+ explain: bool = False,
583
+ language_threshold: float = 0.1,
584
+ enable_fallback: bool = True,
585
+ ) -> CharsetMatches:
586
+ """
587
+ Same thing as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
588
+ Can raise IOError.
589
+ """
590
+ with open(path, "rb") as fp:
591
+ return from_fp(
592
+ fp,
593
+ steps,
594
+ chunk_size,
595
+ threshold,
596
+ cp_isolation,
597
+ cp_exclusion,
598
+ preemptive_behaviour,
599
+ explain,
600
+ language_threshold,
601
+ enable_fallback,
602
+ )
603
+
604
+
605
+ def is_binary(
606
+ fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
607
+ steps: int = 5,
608
+ chunk_size: int = 512,
609
+ threshold: float = 0.20,
610
+ cp_isolation: list[str] | None = None,
611
+ cp_exclusion: list[str] | None = None,
612
+ preemptive_behaviour: bool = True,
613
+ explain: bool = False,
614
+ language_threshold: float = 0.1,
615
+ enable_fallback: bool = False,
616
+ ) -> bool:
617
+ """
618
+ Detect if the given input (file, bytes, or path) points to a binary file, i.e. not text.
619
+ Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
620
+ are disabled, to be stricter with content that is ASCII-compatible but unlikely to be text.
621
+ """
622
+ if isinstance(fp_or_path_or_payload, (str, PathLike)):
623
+ guesses = from_path(
624
+ fp_or_path_or_payload,
625
+ steps=steps,
626
+ chunk_size=chunk_size,
627
+ threshold=threshold,
628
+ cp_isolation=cp_isolation,
629
+ cp_exclusion=cp_exclusion,
630
+ preemptive_behaviour=preemptive_behaviour,
631
+ explain=explain,
632
+ language_threshold=language_threshold,
633
+ enable_fallback=enable_fallback,
634
+ )
635
+ elif isinstance(
636
+ fp_or_path_or_payload,
637
+ (
638
+ bytes,
639
+ bytearray,
640
+ ),
641
+ ):
642
+ guesses = from_bytes(
643
+ fp_or_path_or_payload,
644
+ steps=steps,
645
+ chunk_size=chunk_size,
646
+ threshold=threshold,
647
+ cp_isolation=cp_isolation,
648
+ cp_exclusion=cp_exclusion,
649
+ preemptive_behaviour=preemptive_behaviour,
650
+ explain=explain,
651
+ language_threshold=language_threshold,
652
+ enable_fallback=enable_fallback,
653
+ )
654
+ else:
655
+ guesses = from_fp(
656
+ fp_or_path_or_payload,
657
+ steps=steps,
658
+ chunk_size=chunk_size,
659
+ threshold=threshold,
660
+ cp_isolation=cp_isolation,
661
+ cp_exclusion=cp_exclusion,
662
+ preemptive_behaviour=preemptive_behaviour,
663
+ explain=explain,
664
+ language_threshold=language_threshold,
665
+ enable_fallback=enable_fallback,
666
+ )
667
+
668
+ return not guesses
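For orientation, here is a minimal usage sketch of the public helpers defined above (from_bytes, from_fp, from_path, is_binary). It is illustrative only and not part of the vendored file; the sample payload, the file name "unknown.txt", and the exact code page reported are assumptions, and it assumes the package re-exports these helpers at its top level (the cli module's `from charset_normalizer import from_fp` below suggests it does).

    # Sketch: detect the encoding of a raw byte payload, then of a file on disk.
    from charset_normalizer import from_bytes, from_path, is_binary

    payload = "Bonjour, le café est déjà prêt et la journée commence très tôt.".encode("cp1252")

    matches = from_bytes(payload)      # CharsetMatches container, as returned by from_bytes above
    best_guess = matches.best()        # best CharsetMatch, or None if nothing fit

    if best_guess is not None:
        print(best_guess.encoding)     # e.g. a cp1252-compatible code page

    # from_path() simply opens the file in binary mode and delegates to from_fp()/from_bytes().
    # print(from_path("unknown.txt").best())

    # is_binary() reuses the same heuristics with fallbacks disabled and returns a bool.
    print(is_binary(payload))          # expected False for a decodable text payload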
env/Lib/site-packages/charset_normalizer/cd.py ADDED
@@ -0,0 +1,395 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ from codecs import IncrementalDecoder
5
+ from collections import Counter
6
+ from functools import lru_cache
7
+ from typing import Counter as TypeCounter
8
+
9
+ from .constant import (
10
+ FREQUENCIES,
11
+ KO_NAMES,
12
+ LANGUAGE_SUPPORTED_COUNT,
13
+ TOO_SMALL_SEQUENCE,
14
+ ZH_NAMES,
15
+ )
16
+ from .md import is_suspiciously_successive_range
17
+ from .models import CoherenceMatches
18
+ from .utils import (
19
+ is_accentuated,
20
+ is_latin,
21
+ is_multi_byte_encoding,
22
+ is_unicode_range_secondary,
23
+ unicode_range,
24
+ )
25
+
26
+
27
+ def encoding_unicode_range(iana_name: str) -> list[str]:
28
+ """
29
+ Return the associated Unicode ranges in a single-byte code page.
30
+ """
31
+ if is_multi_byte_encoding(iana_name):
32
+ raise OSError("Function not supported on multi-byte code page")
33
+
34
+ decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
35
+
36
+ p: IncrementalDecoder = decoder(errors="ignore")
37
+ seen_ranges: dict[str, int] = {}
38
+ character_count: int = 0
39
+
40
+ for i in range(0x40, 0xFF):
41
+ chunk: str = p.decode(bytes([i]))
42
+
43
+ if chunk:
44
+ character_range: str | None = unicode_range(chunk)
45
+
46
+ if character_range is None:
47
+ continue
48
+
49
+ if is_unicode_range_secondary(character_range) is False:
50
+ if character_range not in seen_ranges:
51
+ seen_ranges[character_range] = 0
52
+ seen_ranges[character_range] += 1
53
+ character_count += 1
54
+
55
+ return sorted(
56
+ [
57
+ character_range
58
+ for character_range in seen_ranges
59
+ if seen_ranges[character_range] / character_count >= 0.15
60
+ ]
61
+ )
62
+
63
+
64
+ def unicode_range_languages(primary_range: str) -> list[str]:
65
+ """
66
+ Return inferred languages used with a unicode range.
67
+ """
68
+ languages: list[str] = []
69
+
70
+ for language, characters in FREQUENCIES.items():
71
+ for character in characters:
72
+ if unicode_range(character) == primary_range:
73
+ languages.append(language)
74
+ break
75
+
76
+ return languages
77
+
78
+
79
+ @lru_cache()
80
+ def encoding_languages(iana_name: str) -> list[str]:
81
+ """
82
+ Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
83
+ This function does the correspondence.
84
+ """
85
+ unicode_ranges: list[str] = encoding_unicode_range(iana_name)
86
+ primary_range: str | None = None
87
+
88
+ for specified_range in unicode_ranges:
89
+ if "Latin" not in specified_range:
90
+ primary_range = specified_range
91
+ break
92
+
93
+ if primary_range is None:
94
+ return ["Latin Based"]
95
+
96
+ return unicode_range_languages(primary_range)
97
+
98
+
99
+ @lru_cache()
100
+ def mb_encoding_languages(iana_name: str) -> list[str]:
101
+ """
102
+ Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
103
+ This function does the correspondence.
104
+ """
105
+ if (
106
+ iana_name.startswith("shift_")
107
+ or iana_name.startswith("iso2022_jp")
108
+ or iana_name.startswith("euc_j")
109
+ or iana_name == "cp932"
110
+ ):
111
+ return ["Japanese"]
112
+ if iana_name.startswith("gb") or iana_name in ZH_NAMES:
113
+ return ["Chinese"]
114
+ if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
115
+ return ["Korean"]
116
+
117
+ return []
118
+
119
+
120
+ @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
121
+ def get_target_features(language: str) -> tuple[bool, bool]:
122
+ """
123
+ Determine main aspects of a supported language: whether it contains accents and whether it is pure Latin.
124
+ """
125
+ target_have_accents: bool = False
126
+ target_pure_latin: bool = True
127
+
128
+ for character in FREQUENCIES[language]:
129
+ if not target_have_accents and is_accentuated(character):
130
+ target_have_accents = True
131
+ if target_pure_latin and is_latin(character) is False:
132
+ target_pure_latin = False
133
+
134
+ return target_have_accents, target_pure_latin
135
+
136
+
137
+ def alphabet_languages(
138
+ characters: list[str], ignore_non_latin: bool = False
139
+ ) -> list[str]:
140
+ """
141
+ Return the languages associated with the given characters.
142
+ """
143
+ languages: list[tuple[str, float]] = []
144
+
145
+ source_have_accents = any(is_accentuated(character) for character in characters)
146
+
147
+ for language, language_characters in FREQUENCIES.items():
148
+ target_have_accents, target_pure_latin = get_target_features(language)
149
+
150
+ if ignore_non_latin and target_pure_latin is False:
151
+ continue
152
+
153
+ if target_have_accents is False and source_have_accents:
154
+ continue
155
+
156
+ character_count: int = len(language_characters)
157
+
158
+ character_match_count: int = len(
159
+ [c for c in language_characters if c in characters]
160
+ )
161
+
162
+ ratio: float = character_match_count / character_count
163
+
164
+ if ratio >= 0.2:
165
+ languages.append((language, ratio))
166
+
167
+ languages = sorted(languages, key=lambda x: x[1], reverse=True)
168
+
169
+ return [compatible_language[0] for compatible_language in languages]
170
+
171
+
172
+ def characters_popularity_compare(
173
+ language: str, ordered_characters: list[str]
174
+ ) -> float:
175
+ """
176
+ Determine if an ordered list of characters (by occurrence, from most frequent to rarest) matches a particular language.
177
+ The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
178
+ Beware that this function is not strict on the match, in order to ease detection (meaning a close match counts as 1.).
179
+ """
180
+ if language not in FREQUENCIES:
181
+ raise ValueError(f"{language} not available")
182
+
183
+ character_approved_count: int = 0
184
+ FREQUENCIES_language_set = set(FREQUENCIES[language])
185
+
186
+ ordered_characters_count: int = len(ordered_characters)
187
+ target_language_characters_count: int = len(FREQUENCIES[language])
188
+
189
+ large_alphabet: bool = target_language_characters_count > 26
190
+
191
+ for character, character_rank in zip(
192
+ ordered_characters, range(0, ordered_characters_count)
193
+ ):
194
+ if character not in FREQUENCIES_language_set:
195
+ continue
196
+
197
+ character_rank_in_language: int = FREQUENCIES[language].index(character)
198
+ expected_projection_ratio: float = (
199
+ target_language_characters_count / ordered_characters_count
200
+ )
201
+ character_rank_projection: int = int(character_rank * expected_projection_ratio)
202
+
203
+ if (
204
+ large_alphabet is False
205
+ and abs(character_rank_projection - character_rank_in_language) > 4
206
+ ):
207
+ continue
208
+
209
+ if (
210
+ large_alphabet is True
211
+ and abs(character_rank_projection - character_rank_in_language)
212
+ < target_language_characters_count / 3
213
+ ):
214
+ character_approved_count += 1
215
+ continue
216
+
217
+ characters_before_source: list[str] = FREQUENCIES[language][
218
+ 0:character_rank_in_language
219
+ ]
220
+ characters_after_source: list[str] = FREQUENCIES[language][
221
+ character_rank_in_language:
222
+ ]
223
+ characters_before: list[str] = ordered_characters[0:character_rank]
224
+ characters_after: list[str] = ordered_characters[character_rank:]
225
+
226
+ before_match_count: int = len(
227
+ set(characters_before) & set(characters_before_source)
228
+ )
229
+
230
+ after_match_count: int = len(
231
+ set(characters_after) & set(characters_after_source)
232
+ )
233
+
234
+ if len(characters_before_source) == 0 and before_match_count <= 4:
235
+ character_approved_count += 1
236
+ continue
237
+
238
+ if len(characters_after_source) == 0 and after_match_count <= 4:
239
+ character_approved_count += 1
240
+ continue
241
+
242
+ if (
243
+ before_match_count / len(characters_before_source) >= 0.4
244
+ or after_match_count / len(characters_after_source) >= 0.4
245
+ ):
246
+ character_approved_count += 1
247
+ continue
248
+
249
+ return character_approved_count / len(ordered_characters)
250
+
251
+
252
+ def alpha_unicode_split(decoded_sequence: str) -> list[str]:
253
+ """
254
+ Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
255
+ Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
256
+ one containing the Latin letters and the other the Hebrew ones.
257
+ """
258
+ layers: dict[str, str] = {}
259
+
260
+ for character in decoded_sequence:
261
+ if character.isalpha() is False:
262
+ continue
263
+
264
+ character_range: str | None = unicode_range(character)
265
+
266
+ if character_range is None:
267
+ continue
268
+
269
+ layer_target_range: str | None = None
270
+
271
+ for discovered_range in layers:
272
+ if (
273
+ is_suspiciously_successive_range(discovered_range, character_range)
274
+ is False
275
+ ):
276
+ layer_target_range = discovered_range
277
+ break
278
+
279
+ if layer_target_range is None:
280
+ layer_target_range = character_range
281
+
282
+ if layer_target_range not in layers:
283
+ layers[layer_target_range] = character.lower()
284
+ continue
285
+
286
+ layers[layer_target_range] += character.lower()
287
+
288
+ return list(layers.values())
289
+
290
+
291
+ def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
292
+ """
293
+ This function merges results previously given by the function coherence_ratio.
294
+ The return type is the same as coherence_ratio.
295
+ """
296
+ per_language_ratios: dict[str, list[float]] = {}
297
+ for result in results:
298
+ for sub_result in result:
299
+ language, ratio = sub_result
300
+ if language not in per_language_ratios:
301
+ per_language_ratios[language] = [ratio]
302
+ continue
303
+ per_language_ratios[language].append(ratio)
304
+
305
+ merge = [
306
+ (
307
+ language,
308
+ round(
309
+ sum(per_language_ratios[language]) / len(per_language_ratios[language]),
310
+ 4,
311
+ ),
312
+ )
313
+ for language in per_language_ratios
314
+ ]
315
+
316
+ return sorted(merge, key=lambda x: x[1], reverse=True)
317
+
318
+
319
+ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
320
+ """
321
+ We shall NOT return "English—" in CoherenceMatches because it is an alternative
322
+ of "English". This function only keeps the best match and remove the em-dash in it.
323
+ """
324
+ index_results: dict[str, list[float]] = dict()
325
+
326
+ for result in results:
327
+ language, ratio = result
328
+ no_em_name: str = language.replace("—", "")
329
+
330
+ if no_em_name not in index_results:
331
+ index_results[no_em_name] = []
332
+
333
+ index_results[no_em_name].append(ratio)
334
+
335
+ if any(len(index_results[e]) > 1 for e in index_results):
336
+ filtered_results: CoherenceMatches = []
337
+
338
+ for language in index_results:
339
+ filtered_results.append((language, max(index_results[language])))
340
+
341
+ return filtered_results
342
+
343
+ return results
344
+
345
+
346
+ @lru_cache(maxsize=2048)
347
+ def coherence_ratio(
348
+ decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
349
+ ) -> CoherenceMatches:
350
+ """
351
+ Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
352
+ A layer = Character extraction by alphabets/ranges.
353
+ """
354
+
355
+ results: list[tuple[str, float]] = []
356
+ ignore_non_latin: bool = False
357
+
358
+ sufficient_match_count: int = 0
359
+
360
+ lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
361
+ if "Latin Based" in lg_inclusion_list:
362
+ ignore_non_latin = True
363
+ lg_inclusion_list.remove("Latin Based")
364
+
365
+ for layer in alpha_unicode_split(decoded_sequence):
366
+ sequence_frequencies: TypeCounter[str] = Counter(layer)
367
+ most_common = sequence_frequencies.most_common()
368
+
369
+ character_count: int = sum(o for c, o in most_common)
370
+
371
+ if character_count <= TOO_SMALL_SEQUENCE:
372
+ continue
373
+
374
+ popular_character_ordered: list[str] = [c for c, o in most_common]
375
+
376
+ for language in lg_inclusion_list or alphabet_languages(
377
+ popular_character_ordered, ignore_non_latin
378
+ ):
379
+ ratio: float = characters_popularity_compare(
380
+ language, popular_character_ordered
381
+ )
382
+
383
+ if ratio < threshold:
384
+ continue
385
+ elif ratio >= 0.8:
386
+ sufficient_match_count += 1
387
+
388
+ results.append((language, round(ratio, 4)))
389
+
390
+ if sufficient_match_count >= 3:
391
+ break
392
+
393
+ return sorted(
394
+ filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
395
+ )
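The coherence_ratio layer above works on already-decoded text and returns (language, ratio) pairs. Below is a hedged sketch, assuming the module path charset_normalizer.cd exactly as vendored here; the sample sentence is illustrative and the detected languages and ratios depend on the FREQUENCIES table.

    # Sketch: run the language-coherence layer directly on decoded text.
    from charset_normalizer.cd import coherence_ratio

    text = "Ceci est un exemple de texte suffisamment long pour la détection de langue."

    # CoherenceMatches is a list of (language, ratio) tuples, sorted by ratio (descending).
    for language, ratio in coherence_ratio(text, threshold=0.1):
        print(language, ratio)

    # lg_inclusion narrows the candidates, e.g. "French,English"; the special value
    # "Latin Based" toggles the ignore_non_latin behaviour shown above.
    # coherence_ratio(text, 0.1, "French,English")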
env/Lib/site-packages/charset_normalizer/cli/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ from .__main__ import cli_detect, query_yes_no
4
+
5
+ __all__ = (
6
+ "cli_detect",
7
+ "query_yes_no",
8
+ )
env/Lib/site-packages/charset_normalizer/cli/__main__.py ADDED
@@ -0,0 +1,321 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from json import dumps
6
+ from os.path import abspath, basename, dirname, join, realpath
7
+ from platform import python_version
8
+ from unicodedata import unidata_version
9
+
10
+ import charset_normalizer.md as md_module
11
+ from charset_normalizer import from_fp
12
+ from charset_normalizer.models import CliDetectionResult
13
+ from charset_normalizer.version import __version__
14
+
15
+
16
+ def query_yes_no(question: str, default: str = "yes") -> bool:
17
+ """Ask a yes/no question via input() and return their answer.
18
+
19
+ "question" is a string that is presented to the user.
20
+ "default" is the presumed answer if the user just hits <Enter>.
21
+ It must be "yes" (the default), "no" or None (meaning
22
+ an answer is required of the user).
23
+
24
+ The "answer" return value is True for "yes" or False for "no".
25
+
26
+ Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
27
+ """
28
+ valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
29
+ if default is None:
30
+ prompt = " [y/n] "
31
+ elif default == "yes":
32
+ prompt = " [Y/n] "
33
+ elif default == "no":
34
+ prompt = " [y/N] "
35
+ else:
36
+ raise ValueError("invalid default answer: '%s'" % default)
37
+
38
+ while True:
39
+ sys.stdout.write(question + prompt)
40
+ choice = input().lower()
41
+ if default is not None and choice == "":
42
+ return valid[default]
43
+ elif choice in valid:
44
+ return valid[choice]
45
+ else:
46
+ sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
47
+
48
+
49
+ def cli_detect(argv: list[str] | None = None) -> int:
50
+ """
51
+ CLI assistant using ARGV and ArgumentParser
52
+ :param argv:
53
+ :return: 0 if everything is fine, anything else equal trouble
54
+ """
55
+ parser = argparse.ArgumentParser(
56
+ description="The Real First Universal Charset Detector. "
57
+ "Discover originating encoding used on text file. "
58
+ "Normalize text to unicode."
59
+ )
60
+
61
+ parser.add_argument(
62
+ "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
63
+ )
64
+ parser.add_argument(
65
+ "-v",
66
+ "--verbose",
67
+ action="store_true",
68
+ default=False,
69
+ dest="verbose",
70
+ help="Display complementary information about file if any. "
71
+ "Stdout will contain logs about the detection process.",
72
+ )
73
+ parser.add_argument(
74
+ "-a",
75
+ "--with-alternative",
76
+ action="store_true",
77
+ default=False,
78
+ dest="alternatives",
79
+ help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
80
+ )
81
+ parser.add_argument(
82
+ "-n",
83
+ "--normalize",
84
+ action="store_true",
85
+ default=False,
86
+ dest="normalize",
87
+ help="Permit to normalize input file. If not set, program does not write anything.",
88
+ )
89
+ parser.add_argument(
90
+ "-m",
91
+ "--minimal",
92
+ action="store_true",
93
+ default=False,
94
+ dest="minimal",
95
+ help="Only output the charset detected to STDOUT. Disabling JSON output.",
96
+ )
97
+ parser.add_argument(
98
+ "-r",
99
+ "--replace",
100
+ action="store_true",
101
+ default=False,
102
+ dest="replace",
103
+ help="Replace file when trying to normalize it instead of creating a new one.",
104
+ )
105
+ parser.add_argument(
106
+ "-f",
107
+ "--force",
108
+ action="store_true",
109
+ default=False,
110
+ dest="force",
111
+ help="Replace file without asking if you are sure, use this flag with caution.",
112
+ )
113
+ parser.add_argument(
114
+ "-i",
115
+ "--no-preemptive",
116
+ action="store_true",
117
+ default=False,
118
+ dest="no_preemptive",
119
+ help="Disable looking at a charset declaration to hint the detector.",
120
+ )
121
+ parser.add_argument(
122
+ "-t",
123
+ "--threshold",
124
+ action="store",
125
+ default=0.2,
126
+ type=float,
127
+ dest="threshold",
128
+ help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
129
+ )
130
+ parser.add_argument(
131
+ "--version",
132
+ action="version",
133
+ version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
134
+ __version__,
135
+ python_version(),
136
+ unidata_version,
137
+ "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
138
+ ),
139
+ help="Show version information and exit.",
140
+ )
141
+
142
+ args = parser.parse_args(argv)
143
+
144
+ if args.replace is True and args.normalize is False:
145
+ if args.files:
146
+ for my_file in args.files:
147
+ my_file.close()
148
+ print("Use --replace in addition of --normalize only.", file=sys.stderr)
149
+ return 1
150
+
151
+ if args.force is True and args.replace is False:
152
+ if args.files:
153
+ for my_file in args.files:
154
+ my_file.close()
155
+ print("Use --force in addition of --replace only.", file=sys.stderr)
156
+ return 1
157
+
158
+ if args.threshold < 0.0 or args.threshold > 1.0:
159
+ if args.files:
160
+ for my_file in args.files:
161
+ my_file.close()
162
+ print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
163
+ return 1
164
+
165
+ x_ = []
166
+
167
+ for my_file in args.files:
168
+ matches = from_fp(
169
+ my_file,
170
+ threshold=args.threshold,
171
+ explain=args.verbose,
172
+ preemptive_behaviour=args.no_preemptive is False,
173
+ )
174
+
175
+ best_guess = matches.best()
176
+
177
+ if best_guess is None:
178
+ print(
179
+ 'Unable to identify originating encoding for "{}". {}'.format(
180
+ my_file.name,
181
+ (
182
+ "Maybe try increasing maximum amount of chaos."
183
+ if args.threshold < 1.0
184
+ else ""
185
+ ),
186
+ ),
187
+ file=sys.stderr,
188
+ )
189
+ x_.append(
190
+ CliDetectionResult(
191
+ abspath(my_file.name),
192
+ None,
193
+ [],
194
+ [],
195
+ "Unknown",
196
+ [],
197
+ False,
198
+ 1.0,
199
+ 0.0,
200
+ None,
201
+ True,
202
+ )
203
+ )
204
+ else:
205
+ x_.append(
206
+ CliDetectionResult(
207
+ abspath(my_file.name),
208
+ best_guess.encoding,
209
+ best_guess.encoding_aliases,
210
+ [
211
+ cp
212
+ for cp in best_guess.could_be_from_charset
213
+ if cp != best_guess.encoding
214
+ ],
215
+ best_guess.language,
216
+ best_guess.alphabets,
217
+ best_guess.bom,
218
+ best_guess.percent_chaos,
219
+ best_guess.percent_coherence,
220
+ None,
221
+ True,
222
+ )
223
+ )
224
+
225
+ if len(matches) > 1 and args.alternatives:
226
+ for el in matches:
227
+ if el != best_guess:
228
+ x_.append(
229
+ CliDetectionResult(
230
+ abspath(my_file.name),
231
+ el.encoding,
232
+ el.encoding_aliases,
233
+ [
234
+ cp
235
+ for cp in el.could_be_from_charset
236
+ if cp != el.encoding
237
+ ],
238
+ el.language,
239
+ el.alphabets,
240
+ el.bom,
241
+ el.percent_chaos,
242
+ el.percent_coherence,
243
+ None,
244
+ False,
245
+ )
246
+ )
247
+
248
+ if args.normalize is True:
249
+ if best_guess.encoding.startswith("utf") is True:
250
+ print(
251
+ '"{}" file does not need to be normalized, as it already came from unicode.'.format(
252
+ my_file.name
253
+ ),
254
+ file=sys.stderr,
255
+ )
256
+ if my_file.closed is False:
257
+ my_file.close()
258
+ continue
259
+
260
+ dir_path = dirname(realpath(my_file.name))
261
+ file_name = basename(realpath(my_file.name))
262
+
263
+ o_: list[str] = file_name.split(".")
264
+
265
+ if args.replace is False:
266
+ o_.insert(-1, best_guess.encoding)
267
+ if my_file.closed is False:
268
+ my_file.close()
269
+ elif (
270
+ args.force is False
271
+ and query_yes_no(
272
+ 'Are you sure you want to normalize "{}" by replacing it?'.format(
273
+ my_file.name
274
+ ),
275
+ "no",
276
+ )
277
+ is False
278
+ ):
279
+ if my_file.closed is False:
280
+ my_file.close()
281
+ continue
282
+
283
+ try:
284
+ x_[0].unicode_path = join(dir_path, ".".join(o_))
285
+
286
+ with open(x_[0].unicode_path, "wb") as fp:
287
+ fp.write(best_guess.output())
288
+ except OSError as e:
289
+ print(str(e), file=sys.stderr)
290
+ if my_file.closed is False:
291
+ my_file.close()
292
+ return 2
293
+
294
+ if my_file.closed is False:
295
+ my_file.close()
296
+
297
+ if args.minimal is False:
298
+ print(
299
+ dumps(
300
+ [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
301
+ ensure_ascii=True,
302
+ indent=4,
303
+ )
304
+ )
305
+ else:
306
+ for my_file in args.files:
307
+ print(
308
+ ", ".join(
309
+ [
310
+ el.encoding or "undefined"
311
+ for el in x_
312
+ if el.path == abspath(my_file.name)
313
+ ]
314
+ )
315
+ )
316
+
317
+ return 0
318
+
319
+
320
+ if __name__ == "__main__":
321
+ cli_detect()
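Since cli/__main__.py wires everything through argparse, the entry point can also be driven programmatically. A hedged sketch follows; "some_file.txt" is a placeholder path and the flags simply mirror the options declared above.

    # Sketch: call the CLI entry point with an explicit argv list.
    from charset_normalizer.cli import cli_detect

    exit_code = cli_detect(["some_file.txt", "--minimal"])  # prints only the detected charset
    print(exit_code)  # 0 when everything went fine, non-zero on trouble

    # The same module also runs as a script:
    #   python -m charset_normalizer.cli some_file.txt --verbose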
env/Lib/site-packages/charset_normalizer/constant.py ADDED
@@ -0,0 +1,1998 @@
1
+ from __future__ import annotations
2
+
3
+ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
4
+ from encodings.aliases import aliases
5
+ from re import IGNORECASE
6
+ from re import compile as re_compile
7
+
8
+ # Contains, for each eligible encoding, its SIG/BOM as bytes or a list of candidate byte marks
9
+ ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
10
+ "utf_8": BOM_UTF8,
11
+ "utf_7": [
12
+ b"\x2b\x2f\x76\x38",
13
+ b"\x2b\x2f\x76\x39",
14
+ b"\x2b\x2f\x76\x2b",
15
+ b"\x2b\x2f\x76\x2f",
16
+ b"\x2b\x2f\x76\x38\x2d",
17
+ ],
18
+ "gb18030": b"\x84\x31\x95\x33",
19
+ "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
20
+ "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
21
+ }
22
+
23
+ TOO_SMALL_SEQUENCE: int = 32
24
+ TOO_BIG_SEQUENCE: int = int(10e6)
25
+
26
+ UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
27
+
28
+ # Up-to-date Unicode ucd/15.0.0
29
+ UNICODE_RANGES_COMBINED: dict[str, range] = {
30
+ "Control character": range(32),
31
+ "Basic Latin": range(32, 128),
32
+ "Latin-1 Supplement": range(128, 256),
33
+ "Latin Extended-A": range(256, 384),
34
+ "Latin Extended-B": range(384, 592),
35
+ "IPA Extensions": range(592, 688),
36
+ "Spacing Modifier Letters": range(688, 768),
37
+ "Combining Diacritical Marks": range(768, 880),
38
+ "Greek and Coptic": range(880, 1024),
39
+ "Cyrillic": range(1024, 1280),
40
+ "Cyrillic Supplement": range(1280, 1328),
41
+ "Armenian": range(1328, 1424),
42
+ "Hebrew": range(1424, 1536),
43
+ "Arabic": range(1536, 1792),
44
+ "Syriac": range(1792, 1872),
45
+ "Arabic Supplement": range(1872, 1920),
46
+ "Thaana": range(1920, 1984),
47
+ "NKo": range(1984, 2048),
48
+ "Samaritan": range(2048, 2112),
49
+ "Mandaic": range(2112, 2144),
50
+ "Syriac Supplement": range(2144, 2160),
51
+ "Arabic Extended-B": range(2160, 2208),
52
+ "Arabic Extended-A": range(2208, 2304),
53
+ "Devanagari": range(2304, 2432),
54
+ "Bengali": range(2432, 2560),
55
+ "Gurmukhi": range(2560, 2688),
56
+ "Gujarati": range(2688, 2816),
57
+ "Oriya": range(2816, 2944),
58
+ "Tamil": range(2944, 3072),
59
+ "Telugu": range(3072, 3200),
60
+ "Kannada": range(3200, 3328),
61
+ "Malayalam": range(3328, 3456),
62
+ "Sinhala": range(3456, 3584),
63
+ "Thai": range(3584, 3712),
64
+ "Lao": range(3712, 3840),
65
+ "Tibetan": range(3840, 4096),
66
+ "Myanmar": range(4096, 4256),
67
+ "Georgian": range(4256, 4352),
68
+ "Hangul Jamo": range(4352, 4608),
69
+ "Ethiopic": range(4608, 4992),
70
+ "Ethiopic Supplement": range(4992, 5024),
71
+ "Cherokee": range(5024, 5120),
72
+ "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
73
+ "Ogham": range(5760, 5792),
74
+ "Runic": range(5792, 5888),
75
+ "Tagalog": range(5888, 5920),
76
+ "Hanunoo": range(5920, 5952),
77
+ "Buhid": range(5952, 5984),
78
+ "Tagbanwa": range(5984, 6016),
79
+ "Khmer": range(6016, 6144),
80
+ "Mongolian": range(6144, 6320),
81
+ "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
82
+ "Limbu": range(6400, 6480),
83
+ "Tai Le": range(6480, 6528),
84
+ "New Tai Lue": range(6528, 6624),
85
+ "Khmer Symbols": range(6624, 6656),
86
+ "Buginese": range(6656, 6688),
87
+ "Tai Tham": range(6688, 6832),
88
+ "Combining Diacritical Marks Extended": range(6832, 6912),
89
+ "Balinese": range(6912, 7040),
90
+ "Sundanese": range(7040, 7104),
91
+ "Batak": range(7104, 7168),
92
+ "Lepcha": range(7168, 7248),
93
+ "Ol Chiki": range(7248, 7296),
94
+ "Cyrillic Extended-C": range(7296, 7312),
95
+ "Georgian Extended": range(7312, 7360),
96
+ "Sundanese Supplement": range(7360, 7376),
97
+ "Vedic Extensions": range(7376, 7424),
98
+ "Phonetic Extensions": range(7424, 7552),
99
+ "Phonetic Extensions Supplement": range(7552, 7616),
100
+ "Combining Diacritical Marks Supplement": range(7616, 7680),
101
+ "Latin Extended Additional": range(7680, 7936),
102
+ "Greek Extended": range(7936, 8192),
103
+ "General Punctuation": range(8192, 8304),
104
+ "Superscripts and Subscripts": range(8304, 8352),
105
+ "Currency Symbols": range(8352, 8400),
106
+ "Combining Diacritical Marks for Symbols": range(8400, 8448),
107
+ "Letterlike Symbols": range(8448, 8528),
108
+ "Number Forms": range(8528, 8592),
109
+ "Arrows": range(8592, 8704),
110
+ "Mathematical Operators": range(8704, 8960),
111
+ "Miscellaneous Technical": range(8960, 9216),
112
+ "Control Pictures": range(9216, 9280),
113
+ "Optical Character Recognition": range(9280, 9312),
114
+ "Enclosed Alphanumerics": range(9312, 9472),
115
+ "Box Drawing": range(9472, 9600),
116
+ "Block Elements": range(9600, 9632),
117
+ "Geometric Shapes": range(9632, 9728),
118
+ "Miscellaneous Symbols": range(9728, 9984),
119
+ "Dingbats": range(9984, 10176),
120
+ "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
121
+ "Supplemental Arrows-A": range(10224, 10240),
122
+ "Braille Patterns": range(10240, 10496),
123
+ "Supplemental Arrows-B": range(10496, 10624),
124
+ "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
125
+ "Supplemental Mathematical Operators": range(10752, 11008),
126
+ "Miscellaneous Symbols and Arrows": range(11008, 11264),
127
+ "Glagolitic": range(11264, 11360),
128
+ "Latin Extended-C": range(11360, 11392),
129
+ "Coptic": range(11392, 11520),
130
+ "Georgian Supplement": range(11520, 11568),
131
+ "Tifinagh": range(11568, 11648),
132
+ "Ethiopic Extended": range(11648, 11744),
133
+ "Cyrillic Extended-A": range(11744, 11776),
134
+ "Supplemental Punctuation": range(11776, 11904),
135
+ "CJK Radicals Supplement": range(11904, 12032),
136
+ "Kangxi Radicals": range(12032, 12256),
137
+ "Ideographic Description Characters": range(12272, 12288),
138
+ "CJK Symbols and Punctuation": range(12288, 12352),
139
+ "Hiragana": range(12352, 12448),
140
+ "Katakana": range(12448, 12544),
141
+ "Bopomofo": range(12544, 12592),
142
+ "Hangul Compatibility Jamo": range(12592, 12688),
143
+ "Kanbun": range(12688, 12704),
144
+ "Bopomofo Extended": range(12704, 12736),
145
+ "CJK Strokes": range(12736, 12784),
146
+ "Katakana Phonetic Extensions": range(12784, 12800),
147
+ "Enclosed CJK Letters and Months": range(12800, 13056),
148
+ "CJK Compatibility": range(13056, 13312),
149
+ "CJK Unified Ideographs Extension A": range(13312, 19904),
150
+ "Yijing Hexagram Symbols": range(19904, 19968),
151
+ "CJK Unified Ideographs": range(19968, 40960),
152
+ "Yi Syllables": range(40960, 42128),
153
+ "Yi Radicals": range(42128, 42192),
154
+ "Lisu": range(42192, 42240),
155
+ "Vai": range(42240, 42560),
156
+ "Cyrillic Extended-B": range(42560, 42656),
157
+ "Bamum": range(42656, 42752),
158
+ "Modifier Tone Letters": range(42752, 42784),
159
+ "Latin Extended-D": range(42784, 43008),
160
+ "Syloti Nagri": range(43008, 43056),
161
+ "Common Indic Number Forms": range(43056, 43072),
162
+ "Phags-pa": range(43072, 43136),
163
+ "Saurashtra": range(43136, 43232),
164
+ "Devanagari Extended": range(43232, 43264),
165
+ "Kayah Li": range(43264, 43312),
166
+ "Rejang": range(43312, 43360),
167
+ "Hangul Jamo Extended-A": range(43360, 43392),
168
+ "Javanese": range(43392, 43488),
169
+ "Myanmar Extended-B": range(43488, 43520),
170
+ "Cham": range(43520, 43616),
171
+ "Myanmar Extended-A": range(43616, 43648),
172
+ "Tai Viet": range(43648, 43744),
173
+ "Meetei Mayek Extensions": range(43744, 43776),
174
+ "Ethiopic Extended-A": range(43776, 43824),
175
+ "Latin Extended-E": range(43824, 43888),
176
+ "Cherokee Supplement": range(43888, 43968),
177
+ "Meetei Mayek": range(43968, 44032),
178
+ "Hangul Syllables": range(44032, 55216),
179
+ "Hangul Jamo Extended-B": range(55216, 55296),
180
+ "High Surrogates": range(55296, 56192),
181
+ "High Private Use Surrogates": range(56192, 56320),
182
+ "Low Surrogates": range(56320, 57344),
183
+ "Private Use Area": range(57344, 63744),
184
+ "CJK Compatibility Ideographs": range(63744, 64256),
185
+ "Alphabetic Presentation Forms": range(64256, 64336),
186
+ "Arabic Presentation Forms-A": range(64336, 65024),
187
+ "Variation Selectors": range(65024, 65040),
188
+ "Vertical Forms": range(65040, 65056),
189
+ "Combining Half Marks": range(65056, 65072),
190
+ "CJK Compatibility Forms": range(65072, 65104),
191
+ "Small Form Variants": range(65104, 65136),
192
+ "Arabic Presentation Forms-B": range(65136, 65280),
193
+ "Halfwidth and Fullwidth Forms": range(65280, 65520),
194
+ "Specials": range(65520, 65536),
195
+ "Linear B Syllabary": range(65536, 65664),
196
+ "Linear B Ideograms": range(65664, 65792),
197
+ "Aegean Numbers": range(65792, 65856),
198
+ "Ancient Greek Numbers": range(65856, 65936),
199
+ "Ancient Symbols": range(65936, 66000),
200
+ "Phaistos Disc": range(66000, 66048),
201
+ "Lycian": range(66176, 66208),
202
+ "Carian": range(66208, 66272),
203
+ "Coptic Epact Numbers": range(66272, 66304),
204
+ "Old Italic": range(66304, 66352),
205
+ "Gothic": range(66352, 66384),
206
+ "Old Permic": range(66384, 66432),
207
+ "Ugaritic": range(66432, 66464),
208
+ "Old Persian": range(66464, 66528),
209
+ "Deseret": range(66560, 66640),
210
+ "Shavian": range(66640, 66688),
211
+ "Osmanya": range(66688, 66736),
212
+ "Osage": range(66736, 66816),
213
+ "Elbasan": range(66816, 66864),
214
+ "Caucasian Albanian": range(66864, 66928),
215
+ "Vithkuqi": range(66928, 67008),
216
+ "Linear A": range(67072, 67456),
217
+ "Latin Extended-F": range(67456, 67520),
218
+ "Cypriot Syllabary": range(67584, 67648),
219
+ "Imperial Aramaic": range(67648, 67680),
220
+ "Palmyrene": range(67680, 67712),
221
+ "Nabataean": range(67712, 67760),
222
+ "Hatran": range(67808, 67840),
223
+ "Phoenician": range(67840, 67872),
224
+ "Lydian": range(67872, 67904),
225
+ "Meroitic Hieroglyphs": range(67968, 68000),
226
+ "Meroitic Cursive": range(68000, 68096),
227
+ "Kharoshthi": range(68096, 68192),
228
+ "Old South Arabian": range(68192, 68224),
229
+ "Old North Arabian": range(68224, 68256),
230
+ "Manichaean": range(68288, 68352),
231
+ "Avestan": range(68352, 68416),
232
+ "Inscriptional Parthian": range(68416, 68448),
233
+ "Inscriptional Pahlavi": range(68448, 68480),
234
+ "Psalter Pahlavi": range(68480, 68528),
235
+ "Old Turkic": range(68608, 68688),
236
+ "Old Hungarian": range(68736, 68864),
237
+ "Hanifi Rohingya": range(68864, 68928),
238
+ "Rumi Numeral Symbols": range(69216, 69248),
239
+ "Yezidi": range(69248, 69312),
240
+ "Arabic Extended-C": range(69312, 69376),
241
+ "Old Sogdian": range(69376, 69424),
242
+ "Sogdian": range(69424, 69488),
243
+ "Old Uyghur": range(69488, 69552),
244
+ "Chorasmian": range(69552, 69600),
245
+ "Elymaic": range(69600, 69632),
246
+ "Brahmi": range(69632, 69760),
247
+ "Kaithi": range(69760, 69840),
248
+ "Sora Sompeng": range(69840, 69888),
249
+ "Chakma": range(69888, 69968),
250
+ "Mahajani": range(69968, 70016),
251
+ "Sharada": range(70016, 70112),
252
+ "Sinhala Archaic Numbers": range(70112, 70144),
253
+ "Khojki": range(70144, 70224),
254
+ "Multani": range(70272, 70320),
255
+ "Khudawadi": range(70320, 70400),
256
+ "Grantha": range(70400, 70528),
257
+ "Newa": range(70656, 70784),
258
+ "Tirhuta": range(70784, 70880),
259
+ "Siddham": range(71040, 71168),
260
+ "Modi": range(71168, 71264),
261
+ "Mongolian Supplement": range(71264, 71296),
262
+ "Takri": range(71296, 71376),
263
+ "Ahom": range(71424, 71504),
264
+ "Dogra": range(71680, 71760),
265
+ "Warang Citi": range(71840, 71936),
266
+ "Dives Akuru": range(71936, 72032),
267
+ "Nandinagari": range(72096, 72192),
268
+ "Zanabazar Square": range(72192, 72272),
269
+ "Soyombo": range(72272, 72368),
270
+ "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
271
+ "Pau Cin Hau": range(72384, 72448),
272
+ "Devanagari Extended-A": range(72448, 72544),
273
+ "Bhaiksuki": range(72704, 72816),
274
+ "Marchen": range(72816, 72896),
275
+ "Masaram Gondi": range(72960, 73056),
276
+ "Gunjala Gondi": range(73056, 73136),
277
+ "Makasar": range(73440, 73472),
278
+ "Kawi": range(73472, 73568),
279
+ "Lisu Supplement": range(73648, 73664),
280
+ "Tamil Supplement": range(73664, 73728),
281
+ "Cuneiform": range(73728, 74752),
282
+ "Cuneiform Numbers and Punctuation": range(74752, 74880),
283
+ "Early Dynastic Cuneiform": range(74880, 75088),
284
+ "Cypro-Minoan": range(77712, 77824),
285
+ "Egyptian Hieroglyphs": range(77824, 78896),
286
+ "Egyptian Hieroglyph Format Controls": range(78896, 78944),
287
+ "Anatolian Hieroglyphs": range(82944, 83584),
288
+ "Bamum Supplement": range(92160, 92736),
289
+ "Mro": range(92736, 92784),
290
+ "Tangsa": range(92784, 92880),
291
+ "Bassa Vah": range(92880, 92928),
292
+ "Pahawh Hmong": range(92928, 93072),
293
+ "Medefaidrin": range(93760, 93856),
294
+ "Miao": range(93952, 94112),
295
+ "Ideographic Symbols and Punctuation": range(94176, 94208),
296
+ "Tangut": range(94208, 100352),
297
+ "Tangut Components": range(100352, 101120),
298
+ "Khitan Small Script": range(101120, 101632),
299
+ "Tangut Supplement": range(101632, 101760),
300
+ "Kana Extended-B": range(110576, 110592),
301
+ "Kana Supplement": range(110592, 110848),
302
+ "Kana Extended-A": range(110848, 110896),
303
+ "Small Kana Extension": range(110896, 110960),
304
+ "Nushu": range(110960, 111360),
305
+ "Duployan": range(113664, 113824),
306
+ "Shorthand Format Controls": range(113824, 113840),
307
+ "Znamenny Musical Notation": range(118528, 118736),
308
+ "Byzantine Musical Symbols": range(118784, 119040),
309
+ "Musical Symbols": range(119040, 119296),
310
+ "Ancient Greek Musical Notation": range(119296, 119376),
311
+ "Kaktovik Numerals": range(119488, 119520),
312
+ "Mayan Numerals": range(119520, 119552),
313
+ "Tai Xuan Jing Symbols": range(119552, 119648),
314
+ "Counting Rod Numerals": range(119648, 119680),
315
+ "Mathematical Alphanumeric Symbols": range(119808, 120832),
316
+ "Sutton SignWriting": range(120832, 121520),
317
+ "Latin Extended-G": range(122624, 122880),
318
+ "Glagolitic Supplement": range(122880, 122928),
319
+ "Cyrillic Extended-D": range(122928, 123024),
320
+ "Nyiakeng Puachue Hmong": range(123136, 123216),
321
+ "Toto": range(123536, 123584),
322
+ "Wancho": range(123584, 123648),
323
+ "Nag Mundari": range(124112, 124160),
324
+ "Ethiopic Extended-B": range(124896, 124928),
325
+ "Mende Kikakui": range(124928, 125152),
326
+ "Adlam": range(125184, 125280),
327
+ "Indic Siyaq Numbers": range(126064, 126144),
328
+ "Ottoman Siyaq Numbers": range(126208, 126288),
329
+ "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
330
+ "Mahjong Tiles": range(126976, 127024),
331
+ "Domino Tiles": range(127024, 127136),
332
+ "Playing Cards": range(127136, 127232),
333
+ "Enclosed Alphanumeric Supplement": range(127232, 127488),
334
+ "Enclosed Ideographic Supplement": range(127488, 127744),
335
+ "Miscellaneous Symbols and Pictographs": range(127744, 128512),
336
+ "Emoticons range(Emoji)": range(128512, 128592),
337
+ "Ornamental Dingbats": range(128592, 128640),
338
+ "Transport and Map Symbols": range(128640, 128768),
339
+ "Alchemical Symbols": range(128768, 128896),
340
+ "Geometric Shapes Extended": range(128896, 129024),
341
+ "Supplemental Arrows-C": range(129024, 129280),
342
+ "Supplemental Symbols and Pictographs": range(129280, 129536),
343
+ "Chess Symbols": range(129536, 129648),
344
+ "Symbols and Pictographs Extended-A": range(129648, 129792),
345
+ "Symbols for Legacy Computing": range(129792, 130048),
346
+ "CJK Unified Ideographs Extension B": range(131072, 173792),
347
+ "CJK Unified Ideographs Extension C": range(173824, 177984),
348
+ "CJK Unified Ideographs Extension D": range(177984, 178208),
349
+ "CJK Unified Ideographs Extension E": range(178208, 183984),
350
+ "CJK Unified Ideographs Extension F": range(183984, 191472),
351
+ "CJK Compatibility Ideographs Supplement": range(194560, 195104),
352
+ "CJK Unified Ideographs Extension G": range(196608, 201552),
353
+ "CJK Unified Ideographs Extension H": range(201552, 205744),
354
+ "Tags": range(917504, 917632),
355
+ "Variation Selectors Supplement": range(917760, 918000),
356
+ "Supplementary Private Use Area-A": range(983040, 1048576),
357
+ "Supplementary Private Use Area-B": range(1048576, 1114112),
358
+ }
359
+
360
+
361
+ UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
362
+ "Supplement",
363
+ "Extended",
364
+ "Extensions",
365
+ "Modifier",
366
+ "Marks",
367
+ "Punctuation",
368
+ "Symbols",
369
+ "Forms",
370
+ "Operators",
371
+ "Miscellaneous",
372
+ "Drawing",
373
+ "Block",
374
+ "Shapes",
375
+ "Supplemental",
376
+ "Tags",
377
+ ]
378
+
379
+ RE_POSSIBLE_ENCODING_INDICATION = re_compile(
380
+ r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
381
+ IGNORECASE,
382
+ )
383
+
384
+ IANA_NO_ALIASES = [
385
+ "cp720",
386
+ "cp737",
387
+ "cp856",
388
+ "cp874",
389
+ "cp875",
390
+ "cp1006",
391
+ "koi8_r",
392
+ "koi8_t",
393
+ "koi8_u",
394
+ ]
395
+
396
+ IANA_SUPPORTED: list[str] = sorted(
397
+ filter(
398
+ lambda x: x.endswith("_codec") is False
399
+ and x not in {"rot_13", "tactis", "mbcs"},
400
+ list(set(aliases.values())) + IANA_NO_ALIASES,
401
+ )
402
+ )
403
+
404
+ IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
405
+
406
+ # pre-computed code page that are similar using the function cp_similarity.
407
+ IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
408
+ "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
409
+ "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
410
+ "cp1125": ["cp866"],
411
+ "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
412
+ "cp1250": ["iso8859_2"],
413
+ "cp1251": ["kz1048", "ptcp154"],
414
+ "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
415
+ "cp1253": ["iso8859_7"],
416
+ "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
417
+ "cp1257": ["iso8859_13"],
418
+ "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
419
+ "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
420
+ "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
421
+ "cp850": ["cp437", "cp857", "cp858", "cp865"],
422
+ "cp857": ["cp850", "cp858", "cp865"],
423
+ "cp858": ["cp437", "cp850", "cp857", "cp865"],
424
+ "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
425
+ "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
426
+ "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
427
+ "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
428
+ "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
429
+ "cp866": ["cp1125"],
430
+ "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
431
+ "iso8859_11": ["tis_620"],
432
+ "iso8859_13": ["cp1257"],
433
+ "iso8859_14": [
434
+ "iso8859_10",
435
+ "iso8859_15",
436
+ "iso8859_16",
437
+ "iso8859_3",
438
+ "iso8859_9",
439
+ "latin_1",
440
+ ],
441
+ "iso8859_15": [
442
+ "cp1252",
443
+ "cp1254",
444
+ "iso8859_10",
445
+ "iso8859_14",
446
+ "iso8859_16",
447
+ "iso8859_3",
448
+ "iso8859_9",
449
+ "latin_1",
450
+ ],
451
+ "iso8859_16": [
452
+ "iso8859_14",
453
+ "iso8859_15",
454
+ "iso8859_2",
455
+ "iso8859_3",
456
+ "iso8859_9",
457
+ "latin_1",
458
+ ],
459
+ "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
460
+ "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
461
+ "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
462
+ "iso8859_7": ["cp1253"],
463
+ "iso8859_9": [
464
+ "cp1252",
465
+ "cp1254",
466
+ "cp1258",
467
+ "iso8859_10",
468
+ "iso8859_14",
469
+ "iso8859_15",
470
+ "iso8859_16",
471
+ "iso8859_3",
472
+ "iso8859_4",
473
+ "latin_1",
474
+ ],
475
+ "kz1048": ["cp1251", "ptcp154"],
476
+ "latin_1": [
477
+ "cp1252",
478
+ "cp1254",
479
+ "cp1258",
480
+ "iso8859_10",
481
+ "iso8859_14",
482
+ "iso8859_15",
483
+ "iso8859_16",
484
+ "iso8859_3",
485
+ "iso8859_4",
486
+ "iso8859_9",
487
+ ],
488
+ "mac_iceland": ["mac_roman", "mac_turkish"],
489
+ "mac_roman": ["mac_iceland", "mac_turkish"],
490
+ "mac_turkish": ["mac_iceland", "mac_roman"],
491
+ "ptcp154": ["cp1251", "kz1048"],
492
+ "tis_620": ["iso8859_11"],
493
+ }
494
+
495
+
496
+ CHARDET_CORRESPONDENCE: dict[str, str] = {
497
+ "iso2022_kr": "ISO-2022-KR",
498
+ "iso2022_jp": "ISO-2022-JP",
499
+ "euc_kr": "EUC-KR",
500
+ "tis_620": "TIS-620",
501
+ "utf_32": "UTF-32",
502
+ "euc_jp": "EUC-JP",
503
+ "koi8_r": "KOI8-R",
504
+ "iso8859_1": "ISO-8859-1",
505
+ "iso8859_2": "ISO-8859-2",
506
+ "iso8859_5": "ISO-8859-5",
507
+ "iso8859_6": "ISO-8859-6",
508
+ "iso8859_7": "ISO-8859-7",
509
+ "iso8859_8": "ISO-8859-8",
510
+ "utf_16": "UTF-16",
511
+ "cp855": "IBM855",
512
+ "mac_cyrillic": "MacCyrillic",
513
+ "gb2312": "GB2312",
514
+ "gb18030": "GB18030",
515
+ "cp932": "CP932",
516
+ "cp866": "IBM866",
517
+ "utf_8": "utf-8",
518
+ "utf_8_sig": "UTF-8-SIG",
519
+ "shift_jis": "SHIFT_JIS",
520
+ "big5": "Big5",
521
+ "cp1250": "windows-1250",
522
+ "cp1251": "windows-1251",
523
+ "cp1252": "Windows-1252",
524
+ "cp1253": "windows-1253",
525
+ "cp1255": "windows-1255",
526
+ "cp1256": "windows-1256",
527
+ "cp1254": "Windows-1254",
528
+ "cp949": "CP949",
529
+ }
530
+
531
+
532
+ COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
533
+ "<",
534
+ ">",
535
+ "=",
536
+ ":",
537
+ "/",
538
+ "&",
539
+ ";",
540
+ "{",
541
+ "}",
542
+ "[",
543
+ "]",
544
+ ",",
545
+ "|",
546
+ '"',
547
+ "-",
548
+ "(",
549
+ ")",
550
+ }
551
+
552
+
553
+ KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
554
+ ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
555
+
556
+ # Logging LEVEL below DEBUG
557
+ TRACE: int = 5
558
+
559
+
560
+ # Language label that contain the em dash "—"
561
+ # character are to be considered alternative seq to origin
562
+ FREQUENCIES: dict[str, list[str]] = {
563
+ "English": [
564
+ "e",
565
+ "a",
566
+ "t",
567
+ "i",
568
+ "o",
569
+ "n",
570
+ "s",
571
+ "r",
572
+ "h",
573
+ "l",
574
+ "d",
575
+ "c",
576
+ "u",
577
+ "m",
578
+ "f",
579
+ "p",
580
+ "g",
581
+ "w",
582
+ "y",
583
+ "b",
584
+ "v",
585
+ "k",
586
+ "x",
587
+ "j",
588
+ "z",
589
+ "q",
590
+ ],
591
+ "English—": [
592
+ "e",
593
+ "a",
594
+ "t",
595
+ "i",
596
+ "o",
597
+ "n",
598
+ "s",
599
+ "r",
600
+ "h",
601
+ "l",
602
+ "d",
603
+ "c",
604
+ "m",
605
+ "u",
606
+ "f",
607
+ "p",
608
+ "g",
609
+ "w",
610
+ "b",
611
+ "y",
612
+ "v",
613
+ "k",
614
+ "j",
615
+ "x",
616
+ "z",
617
+ "q",
618
+ ],
619
+ "German": [
620
+ "e",
621
+ "n",
622
+ "i",
623
+ "r",
624
+ "s",
625
+ "t",
626
+ "a",
627
+ "d",
628
+ "h",
629
+ "u",
630
+ "l",
631
+ "g",
632
+ "o",
633
+ "c",
634
+ "m",
635
+ "b",
636
+ "f",
637
+ "k",
638
+ "w",
639
+ "z",
640
+ "p",
641
+ "v",
642
+ "ü",
643
+ "ä",
644
+ "ö",
645
+ "j",
646
+ ],
647
+ "French": [
648
+ "e",
649
+ "a",
650
+ "s",
651
+ "n",
652
+ "i",
653
+ "t",
654
+ "r",
655
+ "l",
656
+ "u",
657
+ "o",
658
+ "d",
659
+ "c",
660
+ "p",
661
+ "m",
662
+ "é",
663
+ "v",
664
+ "g",
665
+ "f",
666
+ "b",
667
+ "h",
668
+ "q",
669
+ "à",
670
+ "x",
671
+ "è",
672
+ "y",
673
+ "j",
674
+ ],
675
+ "Dutch": [
676
+ "e",
677
+ "n",
678
+ "a",
679
+ "i",
680
+ "r",
681
+ "t",
682
+ "o",
683
+ "d",
684
+ "s",
685
+ "l",
686
+ "g",
687
+ "h",
688
+ "v",
689
+ "m",
690
+ "u",
691
+ "k",
692
+ "c",
693
+ "p",
694
+ "b",
695
+ "w",
696
+ "j",
697
+ "z",
698
+ "f",
699
+ "y",
700
+ "x",
701
+ "ë",
702
+ ],
703
+ "Italian": [
704
+ "e",
705
+ "i",
706
+ "a",
707
+ "o",
708
+ "n",
709
+ "l",
710
+ "t",
711
+ "r",
712
+ "s",
713
+ "c",
714
+ "d",
715
+ "u",
716
+ "p",
717
+ "m",
718
+ "g",
719
+ "v",
720
+ "f",
721
+ "b",
722
+ "z",
723
+ "h",
724
+ "q",
725
+ "è",
726
+ "à",
727
+ "k",
728
+ "y",
729
+ "ò",
730
+ ],
731
+ "Polish": [
732
+ "a",
733
+ "i",
734
+ "o",
735
+ "e",
736
+ "n",
737
+ "r",
738
+ "z",
739
+ "w",
740
+ "s",
741
+ "c",
742
+ "t",
743
+ "k",
744
+ "y",
745
+ "d",
746
+ "p",
747
+ "m",
748
+ "u",
749
+ "l",
750
+ "j",
751
+ "ł",
752
+ "g",
753
+ "b",
754
+ "h",
755
+ "ą",
756
+ "ę",
757
+ "ó",
758
+ ],
759
+ "Spanish": [
760
+ "e",
761
+ "a",
762
+ "o",
763
+ "n",
764
+ "s",
765
+ "r",
766
+ "i",
767
+ "l",
768
+ "d",
769
+ "t",
770
+ "c",
771
+ "u",
772
+ "m",
773
+ "p",
774
+ "b",
775
+ "g",
776
+ "v",
777
+ "f",
778
+ "y",
779
+ "ó",
780
+ "h",
781
+ "q",
782
+ "í",
783
+ "j",
784
+ "z",
785
+ "á",
786
+ ],
787
+ "Russian": [
788
+ "о",
789
+ "а",
790
+ "е",
791
+ "и",
792
+ "н",
793
+ "с",
794
+ "т",
795
+ "р",
796
+ "в",
797
+ "л",
798
+ "к",
799
+ "м",
800
+ "д",
801
+ "п",
802
+ "у",
803
+ "г",
804
+ "я",
805
+ "ы",
806
+ "з",
807
+ "б",
808
+ "й",
809
+ "ь",
810
+ "ч",
811
+ "х",
812
+ "ж",
813
+ "ц",
814
+ ],
815
+ # Jap-Kanji
816
+ "Japanese": [
817
+ "人",
818
+ "一",
819
+ "大",
820
+ "亅",
821
+ "丁",
822
+ "丨",
823
+ "竹",
824
+ "笑",
825
+ "口",
826
+ "日",
827
+ "今",
828
+ "二",
829
+ "彳",
830
+ "行",
831
+ "十",
832
+ "土",
833
+ "丶",
834
+ "寸",
835
+ "寺",
836
+ "時",
837
+ "乙",
838
+ "丿",
839
+ "乂",
840
+ "气",
841
+ "気",
842
+ "冂",
843
+ "巾",
844
+ "亠",
845
+ "市",
846
+ "目",
847
+ "儿",
848
+ "見",
849
+ "八",
850
+ "小",
851
+ "凵",
852
+ "県",
853
+ "月",
854
+ "彐",
855
+ "門",
856
+ "間",
857
+ "木",
858
+ "東",
859
+ "山",
860
+ "出",
861
+ "本",
862
+ "中",
863
+ "刀",
864
+ "分",
865
+ "耳",
866
+ "又",
867
+ "取",
868
+ "最",
869
+ "言",
870
+ "田",
871
+ "心",
872
+ "思",
873
+ "刂",
874
+ "前",
875
+ "京",
876
+ "尹",
877
+ "事",
878
+ "生",
879
+ "厶",
880
+ "云",
881
+ "会",
882
+ "未",
883
+ "来",
884
+ "白",
885
+ "冫",
886
+ "楽",
887
+ "灬",
888
+ "馬",
889
+ "尸",
890
+ "尺",
891
+ "駅",
892
+ "明",
893
+ "耂",
894
+ "者",
895
+ "了",
896
+ "阝",
897
+ "都",
898
+ "高",
899
+ "卜",
900
+ "占",
901
+ "厂",
902
+ "广",
903
+ "店",
904
+ "子",
905
+ "申",
906
+ "奄",
907
+ "亻",
908
+ "俺",
909
+ "上",
910
+ "方",
911
+ "冖",
912
+ "学",
913
+ "衣",
914
+ "艮",
915
+ "食",
916
+ "自",
917
+ ],
918
+ # Jap-Katakana
919
+ "Japanese—": [
920
+ "ー",
921
+ "ン",
922
+ "ス",
923
+ "・",
924
+ "ル",
925
+ "ト",
926
+ "リ",
927
+ "イ",
928
+ "ア",
929
+ "ラ",
930
+ "ッ",
931
+ "ク",
932
+ "ド",
933
+ "シ",
934
+ "レ",
935
+ "ジ",
936
+ "タ",
937
+ "フ",
938
+ "ロ",
939
+ "カ",
940
+ "テ",
941
+ "マ",
942
+ "ィ",
943
+ "グ",
944
+ "バ",
945
+ "ム",
946
+ "プ",
947
+ "オ",
948
+ "コ",
949
+ "デ",
950
+ "ニ",
951
+ "ウ",
952
+ "メ",
953
+ "サ",
954
+ "ビ",
955
+ "ナ",
956
+ "ブ",
957
+ "ャ",
958
+ "エ",
959
+ "ュ",
960
+ "チ",
961
+ "キ",
962
+ "ズ",
963
+ "ダ",
964
+ "パ",
965
+ "ミ",
966
+ "ェ",
967
+ "ョ",
968
+ "ハ",
969
+ "セ",
970
+ "ベ",
971
+ "ガ",
972
+ "モ",
973
+ "ツ",
974
+ "ネ",
975
+ "ボ",
976
+ "ソ",
977
+ "ノ",
978
+ "ァ",
979
+ "ヴ",
980
+ "ワ",
981
+ "ポ",
982
+ "ペ",
983
+ "ピ",
984
+ "ケ",
985
+ "ゴ",
986
+ "ギ",
987
+ "ザ",
988
+ "ホ",
989
+ "ゲ",
990
+ "ォ",
991
+ "ヤ",
992
+ "ヒ",
993
+ "ユ",
994
+ "ヨ",
995
+ "ヘ",
996
+ "ゼ",
997
+ "ヌ",
998
+ "ゥ",
999
+ "ゾ",
1000
+ "ヶ",
1001
+ "ヂ",
1002
+ "ヲ",
1003
+ "ヅ",
1004
+ "ヵ",
1005
+ "ヱ",
1006
+ "ヰ",
1007
+ "ヮ",
1008
+ "ヽ",
1009
+ "゠",
1010
+ "ヾ",
1011
+ "ヷ",
1012
+ "ヿ",
1013
+ "ヸ",
1014
+ "ヹ",
1015
+ "ヺ",
1016
+ ],
1017
+ # Jap-Hiragana
1018
+ "Japanese——": [
1019
+ "の",
1020
+ "に",
1021
+ "る",
1022
+ "た",
1023
+ "と",
1024
+ "は",
1025
+ "し",
1026
+ "い",
1027
+ "を",
1028
+ "で",
1029
+ "て",
1030
+ "が",
1031
+ "な",
1032
+ "れ",
1033
+ "か",
1034
+ "ら",
1035
+ "さ",
1036
+ "っ",
1037
+ "り",
1038
+ "す",
1039
+ "あ",
1040
+ "も",
1041
+ "こ",
1042
+ "ま",
1043
+ "う",
1044
+ "く",
1045
+ "よ",
1046
+ "き",
1047
+ "ん",
1048
+ "め",
1049
+ "お",
1050
+ "け",
1051
+ "そ",
1052
+ "つ",
1053
+ "だ",
1054
+ "や",
1055
+ "え",
1056
+ "ど",
1057
+ "わ",
1058
+ "ち",
1059
+ "み",
1060
+ "せ",
1061
+ "じ",
1062
+ "ば",
1063
+ "へ",
1064
+ "び",
1065
+ "ず",
1066
+ "ろ",
1067
+ "ほ",
1068
+ "げ",
1069
+ "む",
1070
+ "べ",
1071
+ "ひ",
1072
+ "ょ",
1073
+ "ゆ",
1074
+ "ぶ",
1075
+ "ご",
1076
+ "ゃ",
1077
+ "ね",
1078
+ "ふ",
1079
+ "ぐ",
1080
+ "ぎ",
1081
+ "ぼ",
1082
+ "ゅ",
1083
+ "づ",
1084
+ "ざ",
1085
+ "ぞ",
1086
+ "ぬ",
1087
+ "ぜ",
1088
+ "ぱ",
1089
+ "ぽ",
1090
+ "ぷ",
1091
+ "ぴ",
1092
+ "ぃ",
1093
+ "ぁ",
1094
+ "ぇ",
1095
+ "ぺ",
1096
+ "ゞ",
1097
+ "ぢ",
1098
+ "ぉ",
1099
+ "ぅ",
1100
+ "ゐ",
1101
+ "ゝ",
1102
+ "ゑ",
1103
+ "゛",
1104
+ "゜",
1105
+ "ゎ",
1106
+ "ゔ",
1107
+ "゚",
1108
+ "ゟ",
1109
+ "゙",
1110
+ "ゕ",
1111
+ "ゖ",
1112
+ ],
1113
+ "Portuguese": [
1114
+ "a",
1115
+ "e",
1116
+ "o",
1117
+ "s",
1118
+ "i",
1119
+ "r",
1120
+ "d",
1121
+ "n",
1122
+ "t",
1123
+ "m",
1124
+ "u",
1125
+ "c",
1126
+ "l",
1127
+ "p",
1128
+ "g",
1129
+ "v",
1130
+ "b",
1131
+ "f",
1132
+ "h",
1133
+ "ã",
1134
+ "q",
1135
+ "é",
1136
+ "ç",
1137
+ "á",
1138
+ "z",
1139
+ "í",
1140
+ ],
1141
+ "Swedish": [
1142
+ "e",
1143
+ "a",
1144
+ "n",
1145
+ "r",
1146
+ "t",
1147
+ "s",
1148
+ "i",
1149
+ "l",
1150
+ "d",
1151
+ "o",
1152
+ "m",
1153
+ "k",
1154
+ "g",
1155
+ "v",
1156
+ "h",
1157
+ "f",
1158
+ "u",
1159
+ "p",
1160
+ "ä",
1161
+ "c",
1162
+ "b",
1163
+ "ö",
1164
+ "å",
1165
+ "y",
1166
+ "j",
1167
+ "x",
1168
+ ],
1169
+ "Chinese": [
1170
+ "的",
1171
+ "一",
1172
+ "是",
1173
+ "不",
1174
+ "了",
1175
+ "在",
1176
+ "人",
1177
+ "有",
1178
+ "我",
1179
+ "他",
1180
+ "这",
1181
+ "个",
1182
+ "们",
1183
+ "中",
1184
+ "来",
1185
+ "上",
1186
+ "大",
1187
+ "为",
1188
+ "和",
1189
+ "国",
1190
+ "地",
1191
+ "到",
1192
+ "以",
1193
+ "说",
1194
+ "时",
1195
+ "要",
1196
+ "就",
1197
+ "出",
1198
+ "会",
1199
+ "可",
1200
+ "也",
1201
+ "你",
1202
+ "对",
1203
+ "生",
1204
+ "能",
1205
+ "而",
1206
+ "子",
1207
+ "那",
1208
+ "得",
1209
+ "于",
1210
+ "着",
1211
+ "下",
1212
+ "自",
1213
+ "之",
1214
+ "年",
1215
+ "过",
1216
+ "发",
1217
+ "后",
1218
+ "作",
1219
+ "里",
1220
+ "用",
1221
+ "道",
1222
+ "行",
1223
+ "所",
1224
+ "然",
1225
+ "家",
1226
+ "种",
1227
+ "事",
1228
+ "成",
1229
+ "方",
1230
+ "多",
1231
+ "经",
1232
+ "么",
1233
+ "去",
1234
+ "法",
1235
+ "学",
1236
+ "如",
1237
+ "都",
1238
+ "同",
1239
+ "现",
1240
+ "当",
1241
+ "没",
1242
+ "动",
1243
+ "面",
1244
+ "起",
1245
+ "看",
1246
+ "定",
1247
+ "天",
1248
+ "分",
1249
+ "还",
1250
+ "进",
1251
+ "好",
1252
+ "小",
1253
+ "部",
1254
+ "其",
1255
+ "些",
1256
+ "主",
1257
+ "样",
1258
+ "理",
1259
+ "心",
1260
+ "她",
1261
+ "本",
1262
+ "前",
1263
+ "开",
1264
+ "但",
1265
+ "因",
1266
+ "只",
1267
+ "从",
1268
+ "想",
1269
+ "实",
1270
+ ],
1271
+ "Ukrainian": [
1272
+ "о",
1273
+ "а",
1274
+ "н",
1275
+ "і",
1276
+ "и",
1277
+ "р",
1278
+ "в",
1279
+ "т",
1280
+ "е",
1281
+ "с",
1282
+ "к",
1283
+ "л",
1284
+ "у",
1285
+ "д",
1286
+ "м",
1287
+ "п",
1288
+ "з",
1289
+ "я",
1290
+ "ь",
1291
+ "б",
1292
+ "г",
1293
+ "й",
1294
+ "ч",
1295
+ "х",
1296
+ "ц",
1297
+ "ї",
1298
+ ],
1299
+ "Norwegian": [
1300
+ "e",
1301
+ "r",
1302
+ "n",
1303
+ "t",
1304
+ "a",
1305
+ "s",
1306
+ "i",
1307
+ "o",
1308
+ "l",
1309
+ "d",
1310
+ "g",
1311
+ "k",
1312
+ "m",
1313
+ "v",
1314
+ "f",
1315
+ "p",
1316
+ "u",
1317
+ "b",
1318
+ "h",
1319
+ "å",
1320
+ "y",
1321
+ "j",
1322
+ "ø",
1323
+ "c",
1324
+ "æ",
1325
+ "w",
1326
+ ],
1327
+ "Finnish": [
1328
+ "a",
1329
+ "i",
1330
+ "n",
1331
+ "t",
1332
+ "e",
1333
+ "s",
1334
+ "l",
1335
+ "o",
1336
+ "u",
1337
+ "k",
1338
+ "ä",
1339
+ "m",
1340
+ "r",
1341
+ "v",
1342
+ "j",
1343
+ "h",
1344
+ "p",
1345
+ "y",
1346
+ "d",
1347
+ "ö",
1348
+ "g",
1349
+ "c",
1350
+ "b",
1351
+ "f",
1352
+ "w",
1353
+ "z",
1354
+ ],
1355
+ "Vietnamese": [
1356
+ "n",
1357
+ "h",
1358
+ "t",
1359
+ "i",
1360
+ "c",
1361
+ "g",
1362
+ "a",
1363
+ "o",
1364
+ "u",
1365
+ "m",
1366
+ "l",
1367
+ "r",
1368
+ "à",
1369
+ "đ",
1370
+ "s",
1371
+ "e",
1372
+ "v",
1373
+ "p",
1374
+ "b",
1375
+ "y",
1376
+ "ư",
1377
+ "d",
1378
+ "á",
1379
+ "k",
1380
+ "ộ",
1381
+ "ế",
1382
+ ],
1383
+ "Czech": [
1384
+ "o",
1385
+ "e",
1386
+ "a",
1387
+ "n",
1388
+ "t",
1389
+ "s",
1390
+ "i",
1391
+ "l",
1392
+ "v",
1393
+ "r",
1394
+ "k",
1395
+ "d",
1396
+ "u",
1397
+ "m",
1398
+ "p",
1399
+ "í",
1400
+ "c",
1401
+ "h",
1402
+ "z",
1403
+ "á",
1404
+ "y",
1405
+ "j",
1406
+ "b",
1407
+ "ě",
1408
+ "é",
1409
+ "ř",
1410
+ ],
1411
+ "Hungarian": [
1412
+ "e",
1413
+ "a",
1414
+ "t",
1415
+ "l",
1416
+ "s",
1417
+ "n",
1418
+ "k",
1419
+ "r",
1420
+ "i",
1421
+ "o",
1422
+ "z",
1423
+ "á",
1424
+ "é",
1425
+ "g",
1426
+ "m",
1427
+ "b",
1428
+ "y",
1429
+ "v",
1430
+ "d",
1431
+ "h",
1432
+ "u",
1433
+ "p",
1434
+ "j",
1435
+ "ö",
1436
+ "f",
1437
+ "c",
1438
+ ],
1439
+ "Korean": [
1440
+ "이",
1441
+ "다",
1442
+ "에",
1443
+ "의",
1444
+ "는",
1445
+ "로",
1446
+ "하",
1447
+ "을",
1448
+ "가",
1449
+ "고",
1450
+ "지",
1451
+ "서",
1452
+ "한",
1453
+ "은",
1454
+ "기",
1455
+ "으",
1456
+ "년",
1457
+ "대",
1458
+ "사",
1459
+ "시",
1460
+ "를",
1461
+ "리",
1462
+ "도",
1463
+ "인",
1464
+ "스",
1465
+ "일",
1466
+ ],
1467
+ "Indonesian": [
1468
+ "a",
1469
+ "n",
1470
+ "e",
1471
+ "i",
1472
+ "r",
1473
+ "t",
1474
+ "u",
1475
+ "s",
1476
+ "d",
1477
+ "k",
1478
+ "m",
1479
+ "l",
1480
+ "g",
1481
+ "p",
1482
+ "b",
1483
+ "o",
1484
+ "h",
1485
+ "y",
1486
+ "j",
1487
+ "c",
1488
+ "w",
1489
+ "f",
1490
+ "v",
1491
+ "z",
1492
+ "x",
1493
+ "q",
1494
+ ],
1495
+ "Turkish": [
1496
+ "a",
1497
+ "e",
1498
+ "i",
1499
+ "n",
1500
+ "r",
1501
+ "l",
1502
+ "ı",
1503
+ "k",
1504
+ "d",
1505
+ "t",
1506
+ "s",
1507
+ "m",
1508
+ "y",
1509
+ "u",
1510
+ "o",
1511
+ "b",
1512
+ "ü",
1513
+ "ş",
1514
+ "v",
1515
+ "g",
1516
+ "z",
1517
+ "h",
1518
+ "c",
1519
+ "p",
1520
+ "ç",
1521
+ "ğ",
1522
+ ],
1523
+ "Romanian": [
1524
+ "e",
1525
+ "i",
1526
+ "a",
1527
+ "r",
1528
+ "n",
1529
+ "t",
1530
+ "u",
1531
+ "l",
1532
+ "o",
1533
+ "c",
1534
+ "s",
1535
+ "d",
1536
+ "p",
1537
+ "m",
1538
+ "ă",
1539
+ "f",
1540
+ "v",
1541
+ "î",
1542
+ "g",
1543
+ "b",
1544
+ "ș",
1545
+ "ț",
1546
+ "z",
1547
+ "h",
1548
+ "â",
1549
+ "j",
1550
+ ],
1551
+ "Farsi": [
1552
+ "ا",
1553
+ "ی",
1554
+ "ر",
1555
+ "د",
1556
+ "ن",
1557
+ "ه",
1558
+ "و",
1559
+ "م",
1560
+ "ت",
1561
+ "ب",
1562
+ "س",
1563
+ "ل",
1564
+ "ک",
1565
+ "ش",
1566
+ "ز",
1567
+ "ف",
1568
+ "گ",
1569
+ "ع",
1570
+ "خ",
1571
+ "ق",
1572
+ "ج",
1573
+ "آ",
1574
+ "پ",
1575
+ "ح",
1576
+ "ط",
1577
+ "ص",
1578
+ ],
1579
+ "Arabic": [
1580
+ "ا",
1581
+ "ل",
1582
+ "ي",
1583
+ "م",
1584
+ "و",
1585
+ "ن",
1586
+ "ر",
1587
+ "ت",
1588
+ "ب",
1589
+ "ة",
1590
+ "ع",
1591
+ "د",
1592
+ "س",
1593
+ "ف",
1594
+ "ه",
1595
+ "ك",
1596
+ "ق",
1597
+ "أ",
1598
+ "ح",
1599
+ "ج",
1600
+ "ش",
1601
+ "ط",
1602
+ "ص",
1603
+ "ى",
1604
+ "خ",
1605
+ "إ",
1606
+ ],
1607
+ "Danish": [
1608
+ "e",
1609
+ "r",
1610
+ "n",
1611
+ "t",
1612
+ "a",
1613
+ "i",
1614
+ "s",
1615
+ "d",
1616
+ "l",
1617
+ "o",
1618
+ "g",
1619
+ "m",
1620
+ "k",
1621
+ "f",
1622
+ "v",
1623
+ "u",
1624
+ "b",
1625
+ "h",
1626
+ "p",
1627
+ "å",
1628
+ "y",
1629
+ "ø",
1630
+ "æ",
1631
+ "c",
1632
+ "j",
1633
+ "w",
1634
+ ],
1635
+ "Serbian": [
1636
+ "а",
1637
+ "и",
1638
+ "о",
1639
+ "е",
1640
+ "н",
1641
+ "р",
1642
+ "с",
1643
+ "у",
1644
+ "т",
1645
+ "к",
1646
+ "ј",
1647
+ "в",
1648
+ "д",
1649
+ "м",
1650
+ "п",
1651
+ "л",
1652
+ "г",
1653
+ "з",
1654
+ "б",
1655
+ "a",
1656
+ "i",
1657
+ "e",
1658
+ "o",
1659
+ "n",
1660
+ "ц",
1661
+ "ш",
1662
+ ],
1663
+ "Lithuanian": [
1664
+ "i",
1665
+ "a",
1666
+ "s",
1667
+ "o",
1668
+ "r",
1669
+ "e",
1670
+ "t",
1671
+ "n",
1672
+ "u",
1673
+ "k",
1674
+ "m",
1675
+ "l",
1676
+ "p",
1677
+ "v",
1678
+ "d",
1679
+ "j",
1680
+ "g",
1681
+ "ė",
1682
+ "b",
1683
+ "y",
1684
+ "ų",
1685
+ "š",
1686
+ "ž",
1687
+ "c",
1688
+ "ą",
1689
+ "į",
1690
+ ],
1691
+ "Slovene": [
1692
+ "e",
1693
+ "a",
1694
+ "i",
1695
+ "o",
1696
+ "n",
1697
+ "r",
1698
+ "s",
1699
+ "l",
1700
+ "t",
1701
+ "j",
1702
+ "v",
1703
+ "k",
1704
+ "d",
1705
+ "p",
1706
+ "m",
1707
+ "u",
1708
+ "z",
1709
+ "b",
1710
+ "g",
1711
+ "h",
1712
+ "č",
1713
+ "c",
1714
+ "š",
1715
+ "ž",
1716
+ "f",
1717
+ "y",
1718
+ ],
1719
+ "Slovak": [
1720
+ "o",
1721
+ "a",
1722
+ "e",
1723
+ "n",
1724
+ "i",
1725
+ "r",
1726
+ "v",
1727
+ "t",
1728
+ "s",
1729
+ "l",
1730
+ "k",
1731
+ "d",
1732
+ "m",
1733
+ "p",
1734
+ "u",
1735
+ "c",
1736
+ "h",
1737
+ "j",
1738
+ "b",
1739
+ "z",
1740
+ "á",
1741
+ "y",
1742
+ "ý",
1743
+ "í",
1744
+ "č",
1745
+ "é",
1746
+ ],
1747
+ "Hebrew": [
1748
+ "י",
1749
+ "ו",
1750
+ "ה",
1751
+ "ל",
1752
+ "ר",
1753
+ "ב",
1754
+ "ת",
1755
+ "מ",
1756
+ "א",
1757
+ "ש",
1758
+ "נ",
1759
+ "ע",
1760
+ "ם",
1761
+ "ד",
1762
+ "ק",
1763
+ "ח",
1764
+ "פ",
1765
+ "ס",
1766
+ "כ",
1767
+ "ג",
1768
+ "ט",
1769
+ "צ",
1770
+ "ן",
1771
+ "ז",
1772
+ "ך",
1773
+ ],
1774
+ "Bulgarian": [
1775
+ "а",
1776
+ "и",
1777
+ "о",
1778
+ "е",
1779
+ "н",
1780
+ "т",
1781
+ "р",
1782
+ "с",
1783
+ "в",
1784
+ "л",
1785
+ "к",
1786
+ "д",
1787
+ "п",
1788
+ "м",
1789
+ "з",
1790
+ "г",
1791
+ "я",
1792
+ "ъ",
1793
+ "у",
1794
+ "б",
1795
+ "ч",
1796
+ "ц",
1797
+ "й",
1798
+ "ж",
1799
+ "щ",
1800
+ "х",
1801
+ ],
1802
+ "Croatian": [
1803
+ "a",
1804
+ "i",
1805
+ "o",
1806
+ "e",
1807
+ "n",
1808
+ "r",
1809
+ "j",
1810
+ "s",
1811
+ "t",
1812
+ "u",
1813
+ "k",
1814
+ "l",
1815
+ "v",
1816
+ "d",
1817
+ "m",
1818
+ "p",
1819
+ "g",
1820
+ "z",
1821
+ "b",
1822
+ "c",
1823
+ "č",
1824
+ "h",
1825
+ "š",
1826
+ "ž",
1827
+ "ć",
1828
+ "f",
1829
+ ],
1830
+ "Hindi": [
1831
+ "क",
1832
+ "र",
1833
+ "स",
1834
+ "न",
1835
+ "त",
1836
+ "म",
1837
+ "ह",
1838
+ "प",
1839
+ "य",
1840
+ "ल",
1841
+ "व",
1842
+ "ज",
1843
+ "द",
1844
+ "ग",
1845
+ "ब",
1846
+ "श",
1847
+ "ट",
1848
+ "अ",
1849
+ "ए",
1850
+ "थ",
1851
+ "भ",
1852
+ "ड",
1853
+ "च",
1854
+ "ध",
1855
+ "ष",
1856
+ "इ",
1857
+ ],
1858
+ "Estonian": [
1859
+ "a",
1860
+ "i",
1861
+ "e",
1862
+ "s",
1863
+ "t",
1864
+ "l",
1865
+ "u",
1866
+ "n",
1867
+ "o",
1868
+ "k",
1869
+ "r",
1870
+ "d",
1871
+ "m",
1872
+ "v",
1873
+ "g",
1874
+ "p",
1875
+ "j",
1876
+ "h",
1877
+ "ä",
1878
+ "b",
1879
+ "õ",
1880
+ "ü",
1881
+ "f",
1882
+ "c",
1883
+ "ö",
1884
+ "y",
1885
+ ],
1886
+ "Thai": [
1887
+ "า",
1888
+ "น",
1889
+ "ร",
1890
+ "อ",
1891
+ "ก",
1892
+ "เ",
1893
+ "ง",
1894
+ "ม",
1895
+ "ย",
1896
+ "ล",
1897
+ "ว",
1898
+ "ด",
1899
+ "ท",
1900
+ "ส",
1901
+ "ต",
1902
+ "ะ",
1903
+ "ป",
1904
+ "บ",
1905
+ "ค",
1906
+ "ห",
1907
+ "แ",
1908
+ "จ",
1909
+ "พ",
1910
+ "ช",
1911
+ "ข",
1912
+ "ใ",
1913
+ ],
1914
+ "Greek": [
1915
+ "α",
1916
+ "τ",
1917
+ "ο",
1918
+ "ι",
1919
+ "ε",
1920
+ "ν",
1921
+ "ρ",
1922
+ "σ",
1923
+ "κ",
1924
+ "η",
1925
+ "π",
1926
+ "ς",
1927
+ "υ",
1928
+ "μ",
1929
+ "λ",
1930
+ "ί",
1931
+ "ό",
1932
+ "ά",
1933
+ "γ",
1934
+ "έ",
1935
+ "δ",
1936
+ "ή",
1937
+ "ω",
1938
+ "χ",
1939
+ "θ",
1940
+ "ύ",
1941
+ ],
1942
+ "Tamil": [
1943
+ "க",
1944
+ "த",
1945
+ "ப",
1946
+ "ட",
1947
+ "ர",
1948
+ "ம",
1949
+ "ல",
1950
+ "ன",
1951
+ "வ",
1952
+ "ற",
1953
+ "ய",
1954
+ "ள",
1955
+ "ச",
1956
+ "ந",
1957
+ "இ",
1958
+ "ண",
1959
+ "அ",
1960
+ "ஆ",
1961
+ "ழ",
1962
+ "ங",
1963
+ "எ",
1964
+ "உ",
1965
+ "ஒ",
1966
+ "ஸ",
1967
+ ],
1968
+ "Kazakh": [
1969
+ "а",
1970
+ "ы",
1971
+ "е",
1972
+ "н",
1973
+ "т",
1974
+ "р",
1975
+ "л",
1976
+ "і",
1977
+ "д",
1978
+ "с",
1979
+ "м",
1980
+ "қ",
1981
+ "к",
1982
+ "о",
1983
+ "б",
1984
+ "и",
1985
+ "у",
1986
+ "ғ",
1987
+ "ж",
1988
+ "ң",
1989
+ "з",
1990
+ "ш",
1991
+ "й",
1992
+ "п",
1993
+ "г",
1994
+ "ө",
1995
+ ],
1996
+ }
1997
+
1998
+ LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
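The constants defined above (notably RE_POSSIBLE_ENCODING_INDICATION and the FREQUENCIES table summarised by LANGUAGE_SUPPORTED_COUNT) are consumed by the rest of the package. Below is a minimal, hedged sketch of how the pre-compiled pattern can pull a declared charset out of markup; it assumes the vendored charset_normalizer package in this folder is importable, and the sample HTML string plus the expected capture are illustrative only.

from charset_normalizer.constant import (
    LANGUAGE_SUPPORTED_COUNT,
    RE_POSSIBLE_ENCODING_INDICATION,
)

# The IGNORECASE pattern captures the token following "encoding", "charset" or "coding".
sample = '<meta charset="ISO-8859-1">'
match = RE_POSSIBLE_ENCODING_INDICATION.search(sample)
print(match.group(1) if match else None)  # expected capture: ISO-8859-1

# Number of language frequency profiles shipped in FREQUENCIES above.
print(LANGUAGE_SUPPORTED_COUNT)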
env/Lib/site-packages/charset_normalizer/legacy.py ADDED
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+ from warnings import warn
5
+
6
+ from .api import from_bytes
7
+ from .constant import CHARDET_CORRESPONDENCE
8
+
9
+ # TODO: remove this check when dropping Python 3.7 support
10
+ if TYPE_CHECKING:
11
+ from typing_extensions import TypedDict
12
+
13
+ class ResultDict(TypedDict):
14
+ encoding: str | None
15
+ language: str
16
+ confidence: float | None
17
+
18
+
19
+ def detect(
20
+ byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
21
+ ) -> ResultDict:
22
+ """
23
+ chardet legacy method
24
+ Detect the encoding of the given byte string. It should be mostly backward-compatible.
25
+ Encoding names will match Chardet's own spelling whenever possible (not for encoding names it does not support).
26
+ This function is deprecated and should only be used to ease migration of your project; consult the documentation for
27
+ further information. It is not planned for removal.
28
+
29
+ :param byte_str: The byte sequence to examine.
30
+ :param should_rename_legacy: Should we rename legacy encodings
31
+ to their more modern equivalents?
32
+ """
33
+ if len(kwargs):
34
+ warn(
35
+ f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
36
+ )
37
+
38
+ if not isinstance(byte_str, (bytearray, bytes)):
39
+ raise TypeError( # pragma: nocover
40
+ "Expected object of type bytes or bytearray, got: " "{}".format(
41
+ type(byte_str)
42
+ )
43
+ )
44
+
45
+ if isinstance(byte_str, bytearray):
46
+ byte_str = bytes(byte_str)
47
+
48
+ r = from_bytes(byte_str).best()
49
+
50
+ encoding = r.encoding if r is not None else None
51
+ language = r.language if r is not None and r.language != "Unknown" else ""
52
+ confidence = 1.0 - r.chaos if r is not None else None
53
+
54
+ # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
55
+ # but chardet does return 'utf-8-sig' and it is a valid codec name.
56
+ if r is not None and encoding == "utf_8" and r.bom:
57
+ encoding += "_sig"
58
+
59
+ if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
60
+ encoding = CHARDET_CORRESPONDENCE[encoding]
61
+
62
+ return {
63
+ "encoding": encoding,
64
+ "language": language,
65
+ "confidence": confidence,
66
+ }
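The detect() shim above reproduces chardet's calling convention on top of from_bytes(). A short usage sketch, assuming the vendored package is importable; the cp1252 payload is illustrative and the reported encoding comes from the detection heuristics rather than being guaranteed.

from charset_normalizer.legacy import detect

payload = "Bonjour, où êtes-vous ?".encode("cp1252")
result = detect(payload)  # dict with 'encoding', 'language' and 'confidence' keys

print(result["encoding"], result["language"], result["confidence"])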
env/Lib/site-packages/charset_normalizer/md.py ADDED
@@ -0,0 +1,630 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from logging import getLogger
5
+
6
+ from .constant import (
7
+ COMMON_SAFE_ASCII_CHARACTERS,
8
+ TRACE,
9
+ UNICODE_SECONDARY_RANGE_KEYWORD,
10
+ )
11
+ from .utils import (
12
+ is_accentuated,
13
+ is_arabic,
14
+ is_arabic_isolated_form,
15
+ is_case_variable,
16
+ is_cjk,
17
+ is_emoticon,
18
+ is_hangul,
19
+ is_hiragana,
20
+ is_katakana,
21
+ is_latin,
22
+ is_punctuation,
23
+ is_separator,
24
+ is_symbol,
25
+ is_thai,
26
+ is_unprintable,
27
+ remove_accent,
28
+ unicode_range,
29
+ )
30
+
31
+
32
+ class MessDetectorPlugin:
33
+ """
34
+ Base abstract class used for mess detection plugins.
35
+ All detectors MUST extend and implement given methods.
36
+ """
37
+
38
+ def eligible(self, character: str) -> bool:
39
+ """
40
+ Determine if given character should be fed in.
41
+ """
42
+ raise NotImplementedError # pragma: nocover
43
+
44
+ def feed(self, character: str) -> None:
45
+ """
46
+ The main routine to be executed upon character.
47
+ Insert the logic in which the text would be considered chaotic.
48
+ """
49
+ raise NotImplementedError # pragma: nocover
50
+
51
+ def reset(self) -> None: # pragma: no cover
52
+ """
53
+ Permit to reset the plugin to the initial state.
54
+ """
55
+ raise NotImplementedError
56
+
57
+ @property
58
+ def ratio(self) -> float:
59
+ """
60
+ Compute the chaos ratio based on what your feed() has seen.
61
+ Must NOT be lower than 0.; No restriction gt 0.
62
+ """
63
+ raise NotImplementedError # pragma: nocover
64
+
65
+
66
+ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
67
+ def __init__(self) -> None:
68
+ self._punctuation_count: int = 0
69
+ self._symbol_count: int = 0
70
+ self._character_count: int = 0
71
+
72
+ self._last_printable_char: str | None = None
73
+ self._frenzy_symbol_in_word: bool = False
74
+
75
+ def eligible(self, character: str) -> bool:
76
+ return character.isprintable()
77
+
78
+ def feed(self, character: str) -> None:
79
+ self._character_count += 1
80
+
81
+ if (
82
+ character != self._last_printable_char
83
+ and character not in COMMON_SAFE_ASCII_CHARACTERS
84
+ ):
85
+ if is_punctuation(character):
86
+ self._punctuation_count += 1
87
+ elif (
88
+ character.isdigit() is False
89
+ and is_symbol(character)
90
+ and is_emoticon(character) is False
91
+ ):
92
+ self._symbol_count += 2
93
+
94
+ self._last_printable_char = character
95
+
96
+ def reset(self) -> None: # Abstract
97
+ self._punctuation_count = 0
98
+ self._character_count = 0
99
+ self._symbol_count = 0
100
+
101
+ @property
102
+ def ratio(self) -> float:
103
+ if self._character_count == 0:
104
+ return 0.0
105
+
106
+ ratio_of_punctuation: float = (
107
+ self._punctuation_count + self._symbol_count
108
+ ) / self._character_count
109
+
110
+ return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
111
+
112
+
113
+ class TooManyAccentuatedPlugin(MessDetectorPlugin):
114
+ def __init__(self) -> None:
115
+ self._character_count: int = 0
116
+ self._accentuated_count: int = 0
117
+
118
+ def eligible(self, character: str) -> bool:
119
+ return character.isalpha()
120
+
121
+ def feed(self, character: str) -> None:
122
+ self._character_count += 1
123
+
124
+ if is_accentuated(character):
125
+ self._accentuated_count += 1
126
+
127
+ def reset(self) -> None: # Abstract
128
+ self._character_count = 0
129
+ self._accentuated_count = 0
130
+
131
+ @property
132
+ def ratio(self) -> float:
133
+ if self._character_count < 8:
134
+ return 0.0
135
+
136
+ ratio_of_accentuation: float = self._accentuated_count / self._character_count
137
+ return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
138
+
139
+
140
+ class UnprintablePlugin(MessDetectorPlugin):
141
+ def __init__(self) -> None:
142
+ self._unprintable_count: int = 0
143
+ self._character_count: int = 0
144
+
145
+ def eligible(self, character: str) -> bool:
146
+ return True
147
+
148
+ def feed(self, character: str) -> None:
149
+ if is_unprintable(character):
150
+ self._unprintable_count += 1
151
+ self._character_count += 1
152
+
153
+ def reset(self) -> None: # Abstract
154
+ self._unprintable_count = 0
155
+
156
+ @property
157
+ def ratio(self) -> float:
158
+ if self._character_count == 0:
159
+ return 0.0
160
+
161
+ return (self._unprintable_count * 8) / self._character_count
162
+
163
+
164
+ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
165
+ def __init__(self) -> None:
166
+ self._successive_count: int = 0
167
+ self._character_count: int = 0
168
+
169
+ self._last_latin_character: str | None = None
170
+
171
+ def eligible(self, character: str) -> bool:
172
+ return character.isalpha() and is_latin(character)
173
+
174
+ def feed(self, character: str) -> None:
175
+ self._character_count += 1
176
+ if (
177
+ self._last_latin_character is not None
178
+ and is_accentuated(character)
179
+ and is_accentuated(self._last_latin_character)
180
+ ):
181
+ if character.isupper() and self._last_latin_character.isupper():
182
+ self._successive_count += 1
183
+ # Worse if its the same char duplicated with different accent.
184
+ if remove_accent(character) == remove_accent(self._last_latin_character):
185
+ self._successive_count += 1
186
+ self._last_latin_character = character
187
+
188
+ def reset(self) -> None: # Abstract
189
+ self._successive_count = 0
190
+ self._character_count = 0
191
+ self._last_latin_character = None
192
+
193
+ @property
194
+ def ratio(self) -> float:
195
+ if self._character_count == 0:
196
+ return 0.0
197
+
198
+ return (self._successive_count * 2) / self._character_count
199
+
200
+
201
+ class SuspiciousRange(MessDetectorPlugin):
202
+ def __init__(self) -> None:
203
+ self._suspicious_successive_range_count: int = 0
204
+ self._character_count: int = 0
205
+ self._last_printable_seen: str | None = None
206
+
207
+ def eligible(self, character: str) -> bool:
208
+ return character.isprintable()
209
+
210
+ def feed(self, character: str) -> None:
211
+ self._character_count += 1
212
+
213
+ if (
214
+ character.isspace()
215
+ or is_punctuation(character)
216
+ or character in COMMON_SAFE_ASCII_CHARACTERS
217
+ ):
218
+ self._last_printable_seen = None
219
+ return
220
+
221
+ if self._last_printable_seen is None:
222
+ self._last_printable_seen = character
223
+ return
224
+
225
+ unicode_range_a: str | None = unicode_range(self._last_printable_seen)
226
+ unicode_range_b: str | None = unicode_range(character)
227
+
228
+ if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
229
+ self._suspicious_successive_range_count += 1
230
+
231
+ self._last_printable_seen = character
232
+
233
+ def reset(self) -> None: # Abstract
234
+ self._character_count = 0
235
+ self._suspicious_successive_range_count = 0
236
+ self._last_printable_seen = None
237
+
238
+ @property
239
+ def ratio(self) -> float:
240
+ if self._character_count <= 13:
241
+ return 0.0
242
+
243
+ ratio_of_suspicious_range_usage: float = (
244
+ self._suspicious_successive_range_count * 2
245
+ ) / self._character_count
246
+
247
+ return ratio_of_suspicious_range_usage
248
+
249
+
250
+ class SuperWeirdWordPlugin(MessDetectorPlugin):
251
+ def __init__(self) -> None:
252
+ self._word_count: int = 0
253
+ self._bad_word_count: int = 0
254
+ self._foreign_long_count: int = 0
255
+
256
+ self._is_current_word_bad: bool = False
257
+ self._foreign_long_watch: bool = False
258
+
259
+ self._character_count: int = 0
260
+ self._bad_character_count: int = 0
261
+
262
+ self._buffer: str = ""
263
+ self._buffer_accent_count: int = 0
264
+ self._buffer_glyph_count: int = 0
265
+
266
+ def eligible(self, character: str) -> bool:
267
+ return True
268
+
269
+ def feed(self, character: str) -> None:
270
+ if character.isalpha():
271
+ self._buffer += character
272
+ if is_accentuated(character):
273
+ self._buffer_accent_count += 1
274
+ if (
275
+ self._foreign_long_watch is False
276
+ and (is_latin(character) is False or is_accentuated(character))
277
+ and is_cjk(character) is False
278
+ and is_hangul(character) is False
279
+ and is_katakana(character) is False
280
+ and is_hiragana(character) is False
281
+ and is_thai(character) is False
282
+ ):
283
+ self._foreign_long_watch = True
284
+ if (
285
+ is_cjk(character)
286
+ or is_hangul(character)
287
+ or is_katakana(character)
288
+ or is_hiragana(character)
289
+ or is_thai(character)
290
+ ):
291
+ self._buffer_glyph_count += 1
292
+ return
293
+ if not self._buffer:
294
+ return
295
+ if (
296
+ character.isspace() or is_punctuation(character) or is_separator(character)
297
+ ) and self._buffer:
298
+ self._word_count += 1
299
+ buffer_length: int = len(self._buffer)
300
+
301
+ self._character_count += buffer_length
302
+
303
+ if buffer_length >= 4:
304
+ if self._buffer_accent_count / buffer_length >= 0.5:
305
+ self._is_current_word_bad = True
306
+ # Word/Buffer ending with an upper case accentuated letter are so rare,
307
+ # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
308
+ elif (
309
+ is_accentuated(self._buffer[-1])
310
+ and self._buffer[-1].isupper()
311
+ and all(_.isupper() for _ in self._buffer) is False
312
+ ):
313
+ self._foreign_long_count += 1
314
+ self._is_current_word_bad = True
315
+ elif self._buffer_glyph_count == 1:
316
+ self._is_current_word_bad = True
317
+ self._foreign_long_count += 1
318
+ if buffer_length >= 24 and self._foreign_long_watch:
319
+ camel_case_dst = [
320
+ i
321
+ for c, i in zip(self._buffer, range(0, buffer_length))
322
+ if c.isupper()
323
+ ]
324
+ probable_camel_cased: bool = False
325
+
326
+ if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
327
+ probable_camel_cased = True
328
+
329
+ if not probable_camel_cased:
330
+ self._foreign_long_count += 1
331
+ self._is_current_word_bad = True
332
+
333
+ if self._is_current_word_bad:
334
+ self._bad_word_count += 1
335
+ self._bad_character_count += len(self._buffer)
336
+ self._is_current_word_bad = False
337
+
338
+ self._foreign_long_watch = False
339
+ self._buffer = ""
340
+ self._buffer_accent_count = 0
341
+ self._buffer_glyph_count = 0
342
+ elif (
343
+ character not in {"<", ">", "-", "=", "~", "|", "_"}
344
+ and character.isdigit() is False
345
+ and is_symbol(character)
346
+ ):
347
+ self._is_current_word_bad = True
348
+ self._buffer += character
349
+
350
+ def reset(self) -> None: # Abstract
351
+ self._buffer = ""
352
+ self._is_current_word_bad = False
353
+ self._foreign_long_watch = False
354
+ self._bad_word_count = 0
355
+ self._word_count = 0
356
+ self._character_count = 0
357
+ self._bad_character_count = 0
358
+ self._foreign_long_count = 0
359
+
360
+ @property
361
+ def ratio(self) -> float:
362
+ if self._word_count <= 10 and self._foreign_long_count == 0:
363
+ return 0.0
364
+
365
+ return self._bad_character_count / self._character_count
366
+
367
+
368
+ class CjkInvalidStopPlugin(MessDetectorPlugin):
369
+ """
370
+ GB (Chinese) based encodings often render the full stop incorrectly when the content does not fit and
371
+ can be easily detected. Searches for the overuse of '丅' and '丄'.
372
+ """
373
+
374
+ def __init__(self) -> None:
375
+ self._wrong_stop_count: int = 0
376
+ self._cjk_character_count: int = 0
377
+
378
+ def eligible(self, character: str) -> bool:
379
+ return True
380
+
381
+ def feed(self, character: str) -> None:
382
+ if character in {"丅", "丄"}:
383
+ self._wrong_stop_count += 1
384
+ return
385
+ if is_cjk(character):
386
+ self._cjk_character_count += 1
387
+
388
+ def reset(self) -> None: # Abstract
389
+ self._wrong_stop_count = 0
390
+ self._cjk_character_count = 0
391
+
392
+ @property
393
+ def ratio(self) -> float:
394
+ if self._cjk_character_count < 16:
395
+ return 0.0
396
+ return self._wrong_stop_count / self._cjk_character_count
397
+
398
+
399
+ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
400
+ def __init__(self) -> None:
401
+ self._buf: bool = False
402
+
403
+ self._character_count_since_last_sep: int = 0
404
+
405
+ self._successive_upper_lower_count: int = 0
406
+ self._successive_upper_lower_count_final: int = 0
407
+
408
+ self._character_count: int = 0
409
+
410
+ self._last_alpha_seen: str | None = None
411
+ self._current_ascii_only: bool = True
412
+
413
+ def eligible(self, character: str) -> bool:
414
+ return True
415
+
416
+ def feed(self, character: str) -> None:
417
+ is_concerned = character.isalpha() and is_case_variable(character)
418
+ chunk_sep = is_concerned is False
419
+
420
+ if chunk_sep and self._character_count_since_last_sep > 0:
421
+ if (
422
+ self._character_count_since_last_sep <= 64
423
+ and character.isdigit() is False
424
+ and self._current_ascii_only is False
425
+ ):
426
+ self._successive_upper_lower_count_final += (
427
+ self._successive_upper_lower_count
428
+ )
429
+
430
+ self._successive_upper_lower_count = 0
431
+ self._character_count_since_last_sep = 0
432
+ self._last_alpha_seen = None
433
+ self._buf = False
434
+ self._character_count += 1
435
+ self._current_ascii_only = True
436
+
437
+ return
438
+
439
+ if self._current_ascii_only is True and character.isascii() is False:
440
+ self._current_ascii_only = False
441
+
442
+ if self._last_alpha_seen is not None:
443
+ if (character.isupper() and self._last_alpha_seen.islower()) or (
444
+ character.islower() and self._last_alpha_seen.isupper()
445
+ ):
446
+ if self._buf is True:
447
+ self._successive_upper_lower_count += 2
448
+ self._buf = False
449
+ else:
450
+ self._buf = True
451
+ else:
452
+ self._buf = False
453
+
454
+ self._character_count += 1
455
+ self._character_count_since_last_sep += 1
456
+ self._last_alpha_seen = character
457
+
458
+ def reset(self) -> None: # Abstract
459
+ self._character_count = 0
460
+ self._character_count_since_last_sep = 0
461
+ self._successive_upper_lower_count = 0
462
+ self._successive_upper_lower_count_final = 0
463
+ self._last_alpha_seen = None
464
+ self._buf = False
465
+ self._current_ascii_only = True
466
+
467
+ @property
468
+ def ratio(self) -> float:
469
+ if self._character_count == 0:
470
+ return 0.0
471
+
472
+ return self._successive_upper_lower_count_final / self._character_count
473
+
474
+
475
+ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
476
+ def __init__(self) -> None:
477
+ self._character_count: int = 0
478
+ self._isolated_form_count: int = 0
479
+
480
+ def reset(self) -> None: # Abstract
481
+ self._character_count = 0
482
+ self._isolated_form_count = 0
483
+
484
+ def eligible(self, character: str) -> bool:
485
+ return is_arabic(character)
486
+
487
+ def feed(self, character: str) -> None:
488
+ self._character_count += 1
489
+
490
+ if is_arabic_isolated_form(character):
491
+ self._isolated_form_count += 1
492
+
493
+ @property
494
+ def ratio(self) -> float:
495
+ if self._character_count < 8:
496
+ return 0.0
497
+
498
+ isolated_form_usage: float = self._isolated_form_count / self._character_count
499
+
500
+ return isolated_form_usage
501
+
502
+
503
+ @lru_cache(maxsize=1024)
504
+ def is_suspiciously_successive_range(
505
+ unicode_range_a: str | None, unicode_range_b: str | None
506
+ ) -> bool:
507
+ """
508
+ Determine if two Unicode range seen next to each other can be considered as suspicious.
509
+ """
510
+ if unicode_range_a is None or unicode_range_b is None:
511
+ return True
512
+
513
+ if unicode_range_a == unicode_range_b:
514
+ return False
515
+
516
+ if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
517
+ return False
518
+
519
+ if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
520
+ return False
521
+
522
+ # Latin characters can be accompanied with a combining diacritical mark
523
+ # eg. Vietnamese.
524
+ if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
525
+ "Combining" in unicode_range_a or "Combining" in unicode_range_b
526
+ ):
527
+ return False
528
+
529
+ keywords_range_a, keywords_range_b = (
530
+ unicode_range_a.split(" "),
531
+ unicode_range_b.split(" "),
532
+ )
533
+
534
+ for el in keywords_range_a:
535
+ if el in UNICODE_SECONDARY_RANGE_KEYWORD:
536
+ continue
537
+ if el in keywords_range_b:
538
+ return False
539
+
540
+ # Japanese Exception
541
+ range_a_jp_chars, range_b_jp_chars = (
542
+ unicode_range_a
543
+ in (
544
+ "Hiragana",
545
+ "Katakana",
546
+ ),
547
+ unicode_range_b in ("Hiragana", "Katakana"),
548
+ )
549
+ if (range_a_jp_chars or range_b_jp_chars) and (
550
+ "CJK" in unicode_range_a or "CJK" in unicode_range_b
551
+ ):
552
+ return False
553
+ if range_a_jp_chars and range_b_jp_chars:
554
+ return False
555
+
556
+ if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
557
+ if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
558
+ return False
559
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
560
+ return False
561
+
562
+ # Chinese/Japanese use dedicated range for punctuation and/or separators.
563
+ if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
564
+ unicode_range_a in ["Katakana", "Hiragana"]
565
+ and unicode_range_b in ["Katakana", "Hiragana"]
566
+ ):
567
+ if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
568
+ return False
569
+ if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
570
+ return False
571
+ if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
572
+ return False
573
+
574
+ return True
575
+
576
+
577
+ @lru_cache(maxsize=2048)
578
+ def mess_ratio(
579
+ decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
580
+ ) -> float:
581
+ """
582
+ Compute a mess ratio for a decoded byte sequence. The maximum threshold stops the computation early once reached.
583
+ """
584
+
585
+ detectors: list[MessDetectorPlugin] = [
586
+ md_class() for md_class in MessDetectorPlugin.__subclasses__()
587
+ ]
588
+
589
+ length: int = len(decoded_sequence) + 1
590
+
591
+ mean_mess_ratio: float = 0.0
592
+
593
+ if length < 512:
594
+ intermediary_mean_mess_ratio_calc: int = 32
595
+ elif length <= 1024:
596
+ intermediary_mean_mess_ratio_calc = 64
597
+ else:
598
+ intermediary_mean_mess_ratio_calc = 128
599
+
600
+ for character, index in zip(decoded_sequence + "\n", range(length)):
601
+ for detector in detectors:
602
+ if detector.eligible(character):
603
+ detector.feed(character)
604
+
605
+ if (
606
+ index > 0 and index % intermediary_mean_mess_ratio_calc == 0
607
+ ) or index == length - 1:
608
+ mean_mess_ratio = sum(dt.ratio for dt in detectors)
609
+
610
+ if mean_mess_ratio >= maximum_threshold:
611
+ break
612
+
613
+ if debug:
614
+ logger = getLogger("charset_normalizer")
615
+
616
+ logger.log(
617
+ TRACE,
618
+ "Mess-detector extended-analysis start. "
619
+ f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
620
+ f"maximum_threshold={maximum_threshold}",
621
+ )
622
+
623
+ if len(decoded_sequence) > 16:
624
+ logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
625
+ logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
626
+
627
+ for dt in detectors:
628
+ logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
629
+
630
+ return round(mean_mess_ratio, 3)
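mess_ratio() at the bottom of md.py instantiates every MessDetectorPlugin subclass defined above, feeds each eligible character to them and sums their ratios. A small sketch, assuming the vendored package is importable; both sample strings are made up, and the exact values depend on the plugin thresholds, with the mojibake string expected to score noticeably higher.

from charset_normalizer.md import mess_ratio

clean = "A perfectly ordinary English sentence, nothing unusual here."
mojibake = "ÃƒÂ©lÃƒÂ¨ve dÃƒÂ©codage Ã¯Â¿Â½ cassÃƒÂ© partout"

print(mess_ratio(clean))     # expected to stay close to 0.0 for plain ASCII text
print(mess_ratio(mojibake))  # expected to be higher: accents and suspicious ranges add up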
env/Lib/site-packages/charset_normalizer/models.py ADDED
@@ -0,0 +1,360 @@
1
+ from __future__ import annotations
2
+
3
+ from encodings.aliases import aliases
4
+ from hashlib import sha256
5
+ from json import dumps
6
+ from re import sub
7
+ from typing import Any, Iterator, List, Tuple
8
+
9
+ from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
10
+ from .utils import iana_name, is_multi_byte_encoding, unicode_range
11
+
12
+
13
+ class CharsetMatch:
14
+ def __init__(
15
+ self,
16
+ payload: bytes,
17
+ guessed_encoding: str,
18
+ mean_mess_ratio: float,
19
+ has_sig_or_bom: bool,
20
+ languages: CoherenceMatches,
21
+ decoded_payload: str | None = None,
22
+ preemptive_declaration: str | None = None,
23
+ ):
24
+ self._payload: bytes = payload
25
+
26
+ self._encoding: str = guessed_encoding
27
+ self._mean_mess_ratio: float = mean_mess_ratio
28
+ self._languages: CoherenceMatches = languages
29
+ self._has_sig_or_bom: bool = has_sig_or_bom
30
+ self._unicode_ranges: list[str] | None = None
31
+
32
+ self._leaves: list[CharsetMatch] = []
33
+ self._mean_coherence_ratio: float = 0.0
34
+
35
+ self._output_payload: bytes | None = None
36
+ self._output_encoding: str | None = None
37
+
38
+ self._string: str | None = decoded_payload
39
+
40
+ self._preemptive_declaration: str | None = preemptive_declaration
41
+
42
+ def __eq__(self, other: object) -> bool:
43
+ if not isinstance(other, CharsetMatch):
44
+ if isinstance(other, str):
45
+ return iana_name(other) == self.encoding
46
+ return False
47
+ return self.encoding == other.encoding and self.fingerprint == other.fingerprint
48
+
49
+ def __lt__(self, other: object) -> bool:
50
+ """
51
+ Implemented to make sorted available upon CharsetMatches items.
52
+ """
53
+ if not isinstance(other, CharsetMatch):
54
+ raise ValueError
55
+
56
+ chaos_difference: float = abs(self.chaos - other.chaos)
57
+ coherence_difference: float = abs(self.coherence - other.coherence)
58
+
59
+ # Below 1% difference --> Use Coherence
60
+ if chaos_difference < 0.01 and coherence_difference > 0.02:
61
+ return self.coherence > other.coherence
62
+ elif chaos_difference < 0.01 and coherence_difference <= 0.02:
63
+ # When having a difficult decision, use the result that decoded as many multi-byte as possible.
64
+ # preserve RAM usage!
65
+ if len(self._payload) >= TOO_BIG_SEQUENCE:
66
+ return self.chaos < other.chaos
67
+ return self.multi_byte_usage > other.multi_byte_usage
68
+
69
+ return self.chaos < other.chaos
70
+
71
+ @property
72
+ def multi_byte_usage(self) -> float:
73
+ return 1.0 - (len(str(self)) / len(self.raw))
74
+
75
+ def __str__(self) -> str:
76
+ # Lazy Str Loading
77
+ if self._string is None:
78
+ self._string = str(self._payload, self._encoding, "strict")
79
+ return self._string
80
+
81
+ def __repr__(self) -> str:
82
+ return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
83
+
84
+ def add_submatch(self, other: CharsetMatch) -> None:
85
+ if not isinstance(other, CharsetMatch) or other == self:
86
+ raise ValueError(
87
+ "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
88
+ other.__class__
89
+ )
90
+ )
91
+
92
+ other._string = None # Unload RAM usage; dirty trick.
93
+ self._leaves.append(other)
94
+
95
+ @property
96
+ def encoding(self) -> str:
97
+ return self._encoding
98
+
99
+ @property
100
+ def encoding_aliases(self) -> list[str]:
101
+ """
102
+ Encodings are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
103
+ """
104
+ also_known_as: list[str] = []
105
+ for u, p in aliases.items():
106
+ if self.encoding == u:
107
+ also_known_as.append(p)
108
+ elif self.encoding == p:
109
+ also_known_as.append(u)
110
+ return also_known_as
111
+
112
+ @property
113
+ def bom(self) -> bool:
114
+ return self._has_sig_or_bom
115
+
116
+ @property
117
+ def byte_order_mark(self) -> bool:
118
+ return self._has_sig_or_bom
119
+
120
+ @property
121
+ def languages(self) -> list[str]:
122
+ """
123
+ Return the complete list of possible languages found in decoded sequence.
124
+ Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
125
+ """
126
+ return [e[0] for e in self._languages]
127
+
128
+ @property
129
+ def language(self) -> str:
130
+ """
131
+ Most probable language found in decoded sequence. If none were detected or inferred, the property will return
132
+ "Unknown".
133
+ """
134
+ if not self._languages:
135
+ # Trying to infer the language based on the given encoding
136
+ # Its either English or we should not pronounce ourselves in certain cases.
137
+ if "ascii" in self.could_be_from_charset:
138
+ return "English"
139
+
140
+ # doing it there to avoid circular import
141
+ from charset_normalizer.cd import encoding_languages, mb_encoding_languages
142
+
143
+ languages = (
144
+ mb_encoding_languages(self.encoding)
145
+ if is_multi_byte_encoding(self.encoding)
146
+ else encoding_languages(self.encoding)
147
+ )
148
+
149
+ if len(languages) == 0 or "Latin Based" in languages:
150
+ return "Unknown"
151
+
152
+ return languages[0]
153
+
154
+ return self._languages[0][0]
155
+
156
+ @property
157
+ def chaos(self) -> float:
158
+ return self._mean_mess_ratio
159
+
160
+ @property
161
+ def coherence(self) -> float:
162
+ if not self._languages:
163
+ return 0.0
164
+ return self._languages[0][1]
165
+
166
+ @property
167
+ def percent_chaos(self) -> float:
168
+ return round(self.chaos * 100, ndigits=3)
169
+
170
+ @property
171
+ def percent_coherence(self) -> float:
172
+ return round(self.coherence * 100, ndigits=3)
173
+
174
+ @property
175
+ def raw(self) -> bytes:
176
+ """
177
+ Original untouched bytes.
178
+ """
179
+ return self._payload
180
+
181
+ @property
182
+ def submatch(self) -> list[CharsetMatch]:
183
+ return self._leaves
184
+
185
+ @property
186
+ def has_submatch(self) -> bool:
187
+ return len(self._leaves) > 0
188
+
189
+ @property
190
+ def alphabets(self) -> list[str]:
191
+ if self._unicode_ranges is not None:
192
+ return self._unicode_ranges
193
+ # list detected ranges
194
+ detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
195
+ # filter and sort
196
+ self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
197
+ return self._unicode_ranges
198
+
199
+ @property
200
+ def could_be_from_charset(self) -> list[str]:
201
+ """
202
+ The complete list of encoding that output the exact SAME str result and therefore could be the originating
203
+ encoding.
204
+ This list does include the encoding available in property 'encoding'.
205
+ """
206
+ return [self._encoding] + [m.encoding for m in self._leaves]
207
+
208
+ def output(self, encoding: str = "utf_8") -> bytes:
209
+ """
210
+ Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
211
+ Any errors will be simply ignored by the encoder NOT replaced.
212
+ """
213
+ if self._output_encoding is None or self._output_encoding != encoding:
214
+ self._output_encoding = encoding
215
+ decoded_string = str(self)
216
+ if (
217
+ self._preemptive_declaration is not None
218
+ and self._preemptive_declaration.lower()
219
+ not in ["utf-8", "utf8", "utf_8"]
220
+ ):
221
+ patched_header = sub(
222
+ RE_POSSIBLE_ENCODING_INDICATION,
223
+ lambda m: m.string[m.span()[0] : m.span()[1]].replace(
224
+ m.groups()[0],
225
+ iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
226
+ ),
227
+ decoded_string[:8192],
228
+ count=1,
229
+ )
230
+
231
+ decoded_string = patched_header + decoded_string[8192:]
232
+
233
+ self._output_payload = decoded_string.encode(encoding, "replace")
234
+
235
+ return self._output_payload # type: ignore
236
+
237
+ @property
238
+ def fingerprint(self) -> str:
239
+ """
240
+ Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
241
+ """
242
+ return sha256(self.output()).hexdigest()
243
+
244
+
245
+ class CharsetMatches:
246
+ """
247
+ Container with every CharsetMatch items ordered by default from most probable to the less one.
248
+ Act like a list(iterable) but does not implements all related methods.
249
+ """
250
+
251
+ def __init__(self, results: list[CharsetMatch] | None = None):
252
+ self._results: list[CharsetMatch] = sorted(results) if results else []
253
+
254
+ def __iter__(self) -> Iterator[CharsetMatch]:
255
+ yield from self._results
256
+
257
+ def __getitem__(self, item: int | str) -> CharsetMatch:
258
+ """
259
+ Retrieve a single item either by its position or encoding name (alias may be used here).
260
+ Raise KeyError upon invalid index or encoding not present in results.
261
+ """
262
+ if isinstance(item, int):
263
+ return self._results[item]
264
+ if isinstance(item, str):
265
+ item = iana_name(item, False)
266
+ for result in self._results:
267
+ if item in result.could_be_from_charset:
268
+ return result
269
+ raise KeyError
270
+
271
+ def __len__(self) -> int:
272
+ return len(self._results)
273
+
274
+ def __bool__(self) -> bool:
275
+ return len(self._results) > 0
276
+
277
+ def append(self, item: CharsetMatch) -> None:
278
+ """
279
+ Insert a single match. Will be inserted accordingly to preserve sort.
280
+ Can be inserted as a submatch.
281
+ """
282
+ if not isinstance(item, CharsetMatch):
283
+ raise ValueError(
284
+ "Cannot append instance '{}' to CharsetMatches".format(
285
+ str(item.__class__)
286
+ )
287
+ )
288
+ # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
289
+ if len(item.raw) < TOO_BIG_SEQUENCE:
290
+ for match in self._results:
291
+ if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
292
+ match.add_submatch(item)
293
+ return
294
+ self._results.append(item)
295
+ self._results = sorted(self._results)
296
+
297
+ def best(self) -> CharsetMatch | None:
298
+ """
299
+ Simply return the first match. Strict equivalent to matches[0].
300
+ """
301
+ if not self._results:
302
+ return None
303
+ return self._results[0]
304
+
305
+ def first(self) -> CharsetMatch | None:
306
+ """
307
+ Redundant method; simply calls best(). Kept for backward-compatibility reasons.
308
+ """
309
+ return self.best()
310
+
311
+
312
+ CoherenceMatch = Tuple[str, float]
313
+ CoherenceMatches = List[CoherenceMatch]
314
+
315
+
316
+ class CliDetectionResult:
317
+ def __init__(
318
+ self,
319
+ path: str,
320
+ encoding: str | None,
321
+ encoding_aliases: list[str],
322
+ alternative_encodings: list[str],
323
+ language: str,
324
+ alphabets: list[str],
325
+ has_sig_or_bom: bool,
326
+ chaos: float,
327
+ coherence: float,
328
+ unicode_path: str | None,
329
+ is_preferred: bool,
330
+ ):
331
+ self.path: str = path
332
+ self.unicode_path: str | None = unicode_path
333
+ self.encoding: str | None = encoding
334
+ self.encoding_aliases: list[str] = encoding_aliases
335
+ self.alternative_encodings: list[str] = alternative_encodings
336
+ self.language: str = language
337
+ self.alphabets: list[str] = alphabets
338
+ self.has_sig_or_bom: bool = has_sig_or_bom
339
+ self.chaos: float = chaos
340
+ self.coherence: float = coherence
341
+ self.is_preferred: bool = is_preferred
342
+
343
+ @property
344
+ def __dict__(self) -> dict[str, Any]: # type: ignore
345
+ return {
346
+ "path": self.path,
347
+ "encoding": self.encoding,
348
+ "encoding_aliases": self.encoding_aliases,
349
+ "alternative_encodings": self.alternative_encodings,
350
+ "language": self.language,
351
+ "alphabets": self.alphabets,
352
+ "has_sig_or_bom": self.has_sig_or_bom,
353
+ "chaos": self.chaos,
354
+ "coherence": self.coherence,
355
+ "unicode_path": self.unicode_path,
356
+ "is_preferred": self.is_preferred,
357
+ }
358
+
359
+ def to_json(self) -> str:
360
+ return dumps(self.__dict__, ensure_ascii=True, indent=4)
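The classes above are normally reached through the package's top-level helpers rather than constructed by hand. A minimal, hedged consumption sketch; it assumes the public from_bytes entry point defined in charset_normalizer/api.py, which sits outside this hunk, and the sample payload is arbitrary:

    from charset_normalizer import from_bytes

    payload = "héllo wörld, ça va ?".encode("cp1252")

    matches = from_bytes(payload)          # CharsetMatches, sorted best-first
    best = matches.best()                  # CharsetMatch | None
    if best is not None:
        print(best.encoding)               # detected codec (likely cp1252-compatible here)
        print(best.could_be_from_charset)  # every encoding yielding the same str
        print(str(best))                   # the decoded text itself
        print(best.fingerprint)            # SHA256 of the re-encoded payload
        print(best.output("utf_8"))        # payload re-encoded to UTF-8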
env/Lib/site-packages/charset_normalizer/py.typed ADDED
File without changes
env/Lib/site-packages/charset_normalizer/utils.py ADDED
@@ -0,0 +1,408 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import unicodedata
6
+ from codecs import IncrementalDecoder
7
+ from encodings.aliases import aliases
8
+ from functools import lru_cache
9
+ from re import findall
10
+ from typing import Generator
11
+
12
+ from _multibytecodec import ( # type: ignore[import-not-found,import]
13
+ MultibyteIncrementalDecoder,
14
+ )
15
+
16
+ from .constant import (
17
+ ENCODING_MARKS,
18
+ IANA_SUPPORTED_SIMILAR,
19
+ RE_POSSIBLE_ENCODING_INDICATION,
20
+ UNICODE_RANGES_COMBINED,
21
+ UNICODE_SECONDARY_RANGE_KEYWORD,
22
+ UTF8_MAXIMAL_ALLOCATION,
23
+ )
24
+
25
+
26
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
27
+ def is_accentuated(character: str) -> bool:
28
+ try:
29
+ description: str = unicodedata.name(character)
30
+ except ValueError: # Defensive: unicode database outdated?
31
+ return False
32
+ return (
33
+ "WITH GRAVE" in description
34
+ or "WITH ACUTE" in description
35
+ or "WITH CEDILLA" in description
36
+ or "WITH DIAERESIS" in description
37
+ or "WITH CIRCUMFLEX" in description
38
+ or "WITH TILDE" in description
39
+ or "WITH MACRON" in description
40
+ or "WITH RING ABOVE" in description
41
+ )
42
+
43
+
44
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
45
+ def remove_accent(character: str) -> str:
46
+ decomposed: str = unicodedata.decomposition(character)
47
+ if not decomposed:
48
+ return character
49
+
50
+ codes: list[str] = decomposed.split(" ")
51
+
52
+ return chr(int(codes[0], 16))
53
+
54
+
55
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
56
+ def unicode_range(character: str) -> str | None:
57
+ """
58
+ Retrieve the official Unicode range name for a single character.
59
+ """
60
+ character_ord: int = ord(character)
61
+
62
+ for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
63
+ if character_ord in ord_range:
64
+ return range_name
65
+
66
+ return None
67
+
68
+
69
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
70
+ def is_latin(character: str) -> bool:
71
+ try:
72
+ description: str = unicodedata.name(character)
73
+ except ValueError: # Defensive: unicode database outdated?
74
+ return False
75
+ return "LATIN" in description
76
+
77
+
78
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
79
+ def is_punctuation(character: str) -> bool:
80
+ character_category: str = unicodedata.category(character)
81
+
82
+ if "P" in character_category:
83
+ return True
84
+
85
+ character_range: str | None = unicode_range(character)
86
+
87
+ if character_range is None:
88
+ return False
89
+
90
+ return "Punctuation" in character_range
91
+
92
+
93
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
94
+ def is_symbol(character: str) -> bool:
95
+ character_category: str = unicodedata.category(character)
96
+
97
+ if "S" in character_category or "N" in character_category:
98
+ return True
99
+
100
+ character_range: str | None = unicode_range(character)
101
+
102
+ if character_range is None:
103
+ return False
104
+
105
+ return "Forms" in character_range and character_category != "Lo"
106
+
107
+
108
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
109
+ def is_emoticon(character: str) -> bool:
110
+ character_range: str | None = unicode_range(character)
111
+
112
+ if character_range is None:
113
+ return False
114
+
115
+ return "Emoticons" in character_range or "Pictographs" in character_range
116
+
117
+
118
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
119
+ def is_separator(character: str) -> bool:
120
+ if character.isspace() or character in {"|", "+", "<", ">"}:
121
+ return True
122
+
123
+ character_category: str = unicodedata.category(character)
124
+
125
+ return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
126
+
127
+
128
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
129
+ def is_case_variable(character: str) -> bool:
130
+ return character.islower() != character.isupper()
131
+
132
+
133
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
134
+ def is_cjk(character: str) -> bool:
135
+ try:
136
+ character_name = unicodedata.name(character)
137
+ except ValueError: # Defensive: unicode database outdated?
138
+ return False
139
+
140
+ return "CJK" in character_name
141
+
142
+
143
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
144
+ def is_hiragana(character: str) -> bool:
145
+ try:
146
+ character_name = unicodedata.name(character)
147
+ except ValueError: # Defensive: unicode database outdated?
148
+ return False
149
+
150
+ return "HIRAGANA" in character_name
151
+
152
+
153
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
154
+ def is_katakana(character: str) -> bool:
155
+ try:
156
+ character_name = unicodedata.name(character)
157
+ except ValueError: # Defensive: unicode database outdated?
158
+ return False
159
+
160
+ return "KATAKANA" in character_name
161
+
162
+
163
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
164
+ def is_hangul(character: str) -> bool:
165
+ try:
166
+ character_name = unicodedata.name(character)
167
+ except ValueError: # Defensive: unicode database outdated?
168
+ return False
169
+
170
+ return "HANGUL" in character_name
171
+
172
+
173
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
174
+ def is_thai(character: str) -> bool:
175
+ try:
176
+ character_name = unicodedata.name(character)
177
+ except ValueError: # Defensive: unicode database outdated?
178
+ return False
179
+
180
+ return "THAI" in character_name
181
+
182
+
183
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
184
+ def is_arabic(character: str) -> bool:
185
+ try:
186
+ character_name = unicodedata.name(character)
187
+ except ValueError: # Defensive: unicode database outdated?
188
+ return False
189
+
190
+ return "ARABIC" in character_name
191
+
192
+
193
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
194
+ def is_arabic_isolated_form(character: str) -> bool:
195
+ try:
196
+ character_name = unicodedata.name(character)
197
+ except ValueError: # Defensive: unicode database outdated?
198
+ return False
199
+
200
+ return "ARABIC" in character_name and "ISOLATED FORM" in character_name
201
+
202
+
203
+ @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
204
+ def is_unicode_range_secondary(range_name: str) -> bool:
205
+ return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
206
+
207
+
208
+ @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
209
+ def is_unprintable(character: str) -> bool:
210
+ return (
211
+ character.isspace() is False # includes \n \t \r \v
212
+ and character.isprintable() is False
213
+ and character != "\x1a" # Why? It's the ASCII substitute character.
214
+ and character != "\ufeff" # bug discovered in Python:
215
+ # the Zero Width No-Break Space (Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space.
216
+ )
217
+
218
+
219
+ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
220
+ """
221
+ Extract, using an ASCII-only decoder, any specified encoding declared in the first n bytes.
222
+ """
223
+ if not isinstance(sequence, bytes):
224
+ raise TypeError
225
+
226
+ seq_len: int = len(sequence)
227
+
228
+ results: list[str] = findall(
229
+ RE_POSSIBLE_ENCODING_INDICATION,
230
+ sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
231
+ )
232
+
233
+ if len(results) == 0:
234
+ return None
235
+
236
+ for specified_encoding in results:
237
+ specified_encoding = specified_encoding.lower().replace("-", "_")
238
+
239
+ encoding_alias: str
240
+ encoding_iana: str
241
+
242
+ for encoding_alias, encoding_iana in aliases.items():
243
+ if encoding_alias == specified_encoding:
244
+ return encoding_iana
245
+ if encoding_iana == specified_encoding:
246
+ return encoding_iana
247
+
248
+ return None
249
+
250
+
251
+ @lru_cache(maxsize=128)
252
+ def is_multi_byte_encoding(name: str) -> bool:
253
+ """
254
+ Verify whether a specific encoding is a multi-byte one, based on its IANA name.
255
+ """
256
+ return name in {
257
+ "utf_8",
258
+ "utf_8_sig",
259
+ "utf_16",
260
+ "utf_16_be",
261
+ "utf_16_le",
262
+ "utf_32",
263
+ "utf_32_le",
264
+ "utf_32_be",
265
+ "utf_7",
266
+ } or issubclass(
267
+ importlib.import_module(f"encodings.{name}").IncrementalDecoder,
268
+ MultibyteIncrementalDecoder,
269
+ )
270
+
271
+
272
+ def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
273
+ """
274
+ Identify and extract SIG/BOM in given sequence.
275
+ """
276
+
277
+ for iana_encoding in ENCODING_MARKS:
278
+ marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
279
+
280
+ if isinstance(marks, bytes):
281
+ marks = [marks]
282
+
283
+ for mark in marks:
284
+ if sequence.startswith(mark):
285
+ return iana_encoding, mark
286
+
287
+ return None, b""
288
+
289
+
290
+ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
291
+ return iana_encoding not in {"utf_16", "utf_32"}
292
+
293
+
294
+ def iana_name(cp_name: str, strict: bool = True) -> str:
295
+ """Returns the Python normalized encoding name (Not the IANA official name)."""
296
+ cp_name = cp_name.lower().replace("-", "_")
297
+
298
+ encoding_alias: str
299
+ encoding_iana: str
300
+
301
+ for encoding_alias, encoding_iana in aliases.items():
302
+ if cp_name in [encoding_alias, encoding_iana]:
303
+ return encoding_iana
304
+
305
+ if strict:
306
+ raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
307
+
308
+ return cp_name
309
+
310
+
311
+ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
312
+ if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
313
+ return 0.0
314
+
315
+ decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
316
+ decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
317
+
318
+ id_a: IncrementalDecoder = decoder_a(errors="ignore")
319
+ id_b: IncrementalDecoder = decoder_b(errors="ignore")
320
+
321
+ character_match_count: int = 0
322
+
323
+ for i in range(255):
324
+ to_be_decoded: bytes = bytes([i])
325
+ if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
326
+ character_match_count += 1
327
+
328
+ return character_match_count / 254
329
+
330
+
331
+ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
332
+ """
333
+ Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
334
+ the function cp_similarity.
335
+ """
336
+ return (
337
+ iana_name_a in IANA_SUPPORTED_SIMILAR
338
+ and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
339
+ )
340
+
341
+
342
+ def set_logging_handler(
343
+ name: str = "charset_normalizer",
344
+ level: int = logging.INFO,
345
+ format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
346
+ ) -> None:
347
+ logger = logging.getLogger(name)
348
+ logger.setLevel(level)
349
+
350
+ handler = logging.StreamHandler()
351
+ handler.setFormatter(logging.Formatter(format_string))
352
+ logger.addHandler(handler)
353
+
354
+
355
+ def cut_sequence_chunks(
356
+ sequences: bytes,
357
+ encoding_iana: str,
358
+ offsets: range,
359
+ chunk_size: int,
360
+ bom_or_sig_available: bool,
361
+ strip_sig_or_bom: bool,
362
+ sig_payload: bytes,
363
+ is_multi_byte_decoder: bool,
364
+ decoded_payload: str | None = None,
365
+ ) -> Generator[str, None, None]:
366
+ if decoded_payload and is_multi_byte_decoder is False:
367
+ for i in offsets:
368
+ chunk = decoded_payload[i : i + chunk_size]
369
+ if not chunk:
370
+ break
371
+ yield chunk
372
+ else:
373
+ for i in offsets:
374
+ chunk_end = i + chunk_size
375
+ if chunk_end > len(sequences) + 8:
376
+ continue
377
+
378
+ cut_sequence = sequences[i : i + chunk_size]
379
+
380
+ if bom_or_sig_available and strip_sig_or_bom is False:
381
+ cut_sequence = sig_payload + cut_sequence
382
+
383
+ chunk = cut_sequence.decode(
384
+ encoding_iana,
385
+ errors="ignore" if is_multi_byte_decoder else "strict",
386
+ )
387
+
388
+ # multi-byte bad cutting detector and adjustment
389
+ # not the cleanest way to perform that fix but clever enough for now.
390
+ if is_multi_byte_decoder and i > 0:
391
+ chunk_partial_size_chk: int = min(chunk_size, 16)
392
+
393
+ if (
394
+ decoded_payload
395
+ and chunk[:chunk_partial_size_chk] not in decoded_payload
396
+ ):
397
+ for j in range(i, i - 4, -1):
398
+ cut_sequence = sequences[j:chunk_end]
399
+
400
+ if bom_or_sig_available and strip_sig_or_bom is False:
401
+ cut_sequence = sig_payload + cut_sequence
402
+
403
+ chunk = cut_sequence.decode(encoding_iana, errors="ignore")
404
+
405
+ if chunk[:chunk_partial_size_chk] in decoded_payload:
406
+ break
407
+
408
+ yield chunk
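A few of the helpers above can be exercised directly. A small, hedged sketch; the exact values depend on the tables in constant.py and on encodings.aliases, neither of which is reproduced in this hunk:

    from charset_normalizer.utils import (
        iana_name,
        identify_sig_or_bom,
        is_multi_byte_encoding,
        unicode_range,
    )

    print(unicode_range("é"))                 # name of the Unicode block containing U+00E9
    print(iana_name("UTF-8"))                 # normalized Python codec name, i.e. 'utf_8'
    print(is_multi_byte_encoding("utf_16"))   # True (listed in the hard-coded set)
    print(identify_sig_or_bom("hello".encode("utf_8_sig")))  # should report the UTF-8 BOM per ENCODING_MARKS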
env/Lib/site-packages/charset_normalizer/version.py ADDED
@@ -0,0 +1,8 @@
1
+ """
2
+ Expose version
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ __version__ = "3.4.1"
8
+ VERSION = __version__.split(".")
env/Lib/site-packages/colorama-0.4.6.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
1
+ pip
env/Lib/site-packages/colorama-0.4.6.dist-info/METADATA ADDED
@@ -0,0 +1,441 @@
1
+ Metadata-Version: 2.1
2
+ Name: colorama
3
+ Version: 0.4.6
4
+ Summary: Cross-platform colored terminal text.
5
+ Project-URL: Homepage, https://github.com/tartley/colorama
6
+ Author-email: Jonathan Hartley <[email protected]>
7
+ License-File: LICENSE.txt
8
+ Keywords: ansi,color,colour,crossplatform,terminal,text,windows,xplatform
9
+ Classifier: Development Status :: 5 - Production/Stable
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: BSD License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 2
16
+ Classifier: Programming Language :: Python :: 2.7
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: Implementation :: CPython
23
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
24
+ Classifier: Topic :: Terminals
25
+ Requires-Python: !=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7
26
+ Description-Content-Type: text/x-rst
27
+
28
+ .. image:: https://img.shields.io/pypi/v/colorama.svg
29
+ :target: https://pypi.org/project/colorama/
30
+ :alt: Latest Version
31
+
32
+ .. image:: https://img.shields.io/pypi/pyversions/colorama.svg
33
+ :target: https://pypi.org/project/colorama/
34
+ :alt: Supported Python versions
35
+
36
+ .. image:: https://github.com/tartley/colorama/actions/workflows/test.yml/badge.svg
37
+ :target: https://github.com/tartley/colorama/actions/workflows/test.yml
38
+ :alt: Build Status
39
+
40
+ Colorama
41
+ ========
42
+
43
+ Makes ANSI escape character sequences (for producing colored terminal text and
44
+ cursor positioning) work under MS Windows.
45
+
46
+ .. |donate| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif
47
+ :target: https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=2MZ9D2GMLYCUJ&item_name=Colorama&currency_code=USD
48
+ :alt: Donate with Paypal
49
+
50
+ `PyPI for releases <https://pypi.org/project/colorama/>`_ |
51
+ `Github for source <https://github.com/tartley/colorama>`_ |
52
+ `Colorama for enterprise on Tidelift <https://github.com/tartley/colorama/blob/master/ENTERPRISE.md>`_
53
+
54
+ If you find Colorama useful, please |donate| to the authors. Thank you!
55
+
56
+ Installation
57
+ ------------
58
+
59
+ Tested on CPython 2.7, 3.7, 3.8, 3.9 and 3.10 and Pypy 2.7 and 3.8.
60
+
61
+ No requirements other than the standard library.
62
+
63
+ .. code-block:: bash
64
+
65
+ pip install colorama
66
+ # or
67
+ conda install -c anaconda colorama
68
+
69
+ Description
70
+ -----------
71
+
72
+ ANSI escape character sequences have long been used to produce colored terminal
73
+ text and cursor positioning on Unix and Macs. Colorama makes this work on
74
+ Windows, too, by wrapping ``stdout``, stripping ANSI sequences it finds (which
75
+ would appear as gobbledygook in the output), and converting them into the
76
+ appropriate win32 calls to modify the state of the terminal. On other platforms,
77
+ Colorama does nothing.
78
+
79
+ This has the upshot of providing a simple cross-platform API for printing
80
+ colored terminal text from Python, and has the happy side-effect that existing
81
+ applications or libraries which use ANSI sequences to produce colored output on
82
+ Linux or Macs can now also work on Windows, simply by calling
83
+ ``colorama.just_fix_windows_console()`` (since v0.4.6) or ``colorama.init()``
84
+ (all versions, but may have other side-effects – see below).
85
+
86
+ An alternative approach is to install ``ansi.sys`` on Windows machines, which
87
+ provides the same behaviour for all applications running in terminals. Colorama
88
+ is intended for situations where that isn't easy (e.g., maybe your app doesn't
89
+ have an installer.)
90
+
91
+ Demo scripts in the source code repository print some colored text using
92
+ ANSI sequences. Compare their output under Gnome-terminal's built in ANSI
93
+ handling, versus on Windows Command-Prompt using Colorama:
94
+
95
+ .. image:: https://github.com/tartley/colorama/raw/master/screenshots/ubuntu-demo.png
96
+ :width: 661
97
+ :height: 357
98
+ :alt: ANSI sequences on Ubuntu under gnome-terminal.
99
+
100
+ .. image:: https://github.com/tartley/colorama/raw/master/screenshots/windows-demo.png
101
+ :width: 668
102
+ :height: 325
103
+ :alt: Same ANSI sequences on Windows, using Colorama.
104
+
105
+ These screenshots show that, on Windows, Colorama does not support ANSI 'dim
106
+ text'; it looks the same as 'normal text'.
107
+
108
+ Usage
109
+ -----
110
+
111
+ Initialisation
112
+ ..............
113
+
114
+ If the only thing you want from Colorama is to get ANSI escapes to work on
115
+ Windows, then run:
116
+
117
+ .. code-block:: python
118
+
119
+ from colorama import just_fix_windows_console
120
+ just_fix_windows_console()
121
+
122
+ If you're on a recent version of Windows 10 or better, and your stdout/stderr
123
+ are pointing to a Windows console, then this will flip the magic configuration
124
+ switch to enable Windows' built-in ANSI support.
125
+
126
+ If you're on an older version of Windows, and your stdout/stderr are pointing to
127
+ a Windows console, then this will wrap ``sys.stdout`` and/or ``sys.stderr`` in a
128
+ magic file object that intercepts ANSI escape sequences and issues the
129
+ appropriate Win32 calls to emulate them.
130
+
131
+ In all other circumstances, it does nothing whatsoever. Basically the idea is
132
+ that this makes Windows act like Unix with respect to ANSI escape handling.
133
+
134
+ It's safe to call this function multiple times. It's safe to call this function
135
+ on non-Windows platforms, but it won't do anything. It's safe to call this
136
+ function when one or both of your stdout/stderr are redirected to a file – it
137
+ won't do anything to those streams.
138
+
139
+ Alternatively, you can use the older interface with more features (but also more
140
+ potential footguns):
141
+
142
+ .. code-block:: python
143
+
144
+ from colorama import init
145
+ init()
146
+
147
+ This does the same thing as ``just_fix_windows_console``, except for the
148
+ following differences:
149
+
150
+ - It's not safe to call ``init`` multiple times; you can end up with multiple
151
+ layers of wrapping and broken ANSI support.
152
+
153
+ - Colorama will apply a heuristic to guess whether stdout/stderr support ANSI,
154
+ and if it thinks they don't, then it will wrap ``sys.stdout`` and
155
+ ``sys.stderr`` in a magic file object that strips out ANSI escape sequences
156
+ before printing them. This happens on all platforms, and can be convenient if
157
+ you want to write your code to emit ANSI escape sequences unconditionally, and
158
+ let Colorama decide whether they should actually be output. But note that
159
+ Colorama's heuristic is not particularly clever.
160
+
161
+ - ``init`` also accepts explicit keyword args to enable/disable various
162
+ functionality – see below.
163
+
164
+ To stop using Colorama before your program exits, simply call ``deinit()``.
165
+ This will restore ``stdout`` and ``stderr`` to their original values, so that
166
+ Colorama is disabled. To resume using Colorama again, call ``reinit()``; it is
167
+ cheaper than calling ``init()`` again (but does the same thing).
168
+
169
+ Most users should depend on ``colorama >= 0.4.6``, and use
170
+ ``just_fix_windows_console``. The old ``init`` interface will be supported
171
+ indefinitely for backwards compatibility, but we don't plan to fix any issues
172
+ with it, also for backwards compatibility.
173
+
174
+ Colored Output
175
+ ..............
176
+
177
+ Cross-platform printing of colored text can then be done using Colorama's
178
+ constant shorthand for ANSI escape sequences. These are deliberately
179
+ rudimentary, see below.
180
+
181
+ .. code-block:: python
182
+
183
+ from colorama import Fore, Back, Style
184
+ print(Fore.RED + 'some red text')
185
+ print(Back.GREEN + 'and with a green background')
186
+ print(Style.DIM + 'and in dim text')
187
+ print(Style.RESET_ALL)
188
+ print('back to normal now')
189
+
190
+ ...or simply by manually printing ANSI sequences from your own code:
191
+
192
+ .. code-block:: python
193
+
194
+ print('\033[31m' + 'some red text')
195
+ print('\033[39m') # and reset to default color
196
+
197
+ ...or, Colorama can be used in conjunction with existing ANSI libraries
198
+ such as the venerable `Termcolor <https://pypi.org/project/termcolor/>`_,
199
+ the fabulous `Blessings <https://pypi.org/project/blessings/>`_,
200
+ or the incredible `Rich <https://pypi.org/project/rich/>`_.
201
+
202
+ If you wish Colorama's Fore, Back and Style constants were more capable,
203
+ then consider using one of the above highly capable libraries to generate
204
+ colors, etc, and use Colorama just for its primary purpose: to convert
205
+ those ANSI sequences to also work on Windows:
206
+
207
+ SIMILARLY, do not send PRs adding the generation of new ANSI types to Colorama.
208
+ We are only interested in converting ANSI codes to win32 API calls, not
209
+ shortcuts like the above to generate ANSI characters.
210
+
211
+ .. code-block:: python
212
+
213
+ from colorama import just_fix_windows_console
214
+ from termcolor import colored
215
+
216
+ # use Colorama to make Termcolor work on Windows too
217
+ just_fix_windows_console()
218
+
219
+ # then use Termcolor for all colored text output
220
+ print(colored('Hello, World!', 'green', 'on_red'))
221
+
222
+ Available formatting constants are::
223
+
224
+ Fore: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
225
+ Back: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, RESET.
226
+ Style: DIM, NORMAL, BRIGHT, RESET_ALL
227
+
228
+ ``Style.RESET_ALL`` resets foreground, background, and brightness. Colorama will
229
+ perform this reset automatically on program exit.
230
+
231
+ These are fairly well supported, but not part of the standard::
232
+
233
+ Fore: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
234
+ Back: LIGHTBLACK_EX, LIGHTRED_EX, LIGHTGREEN_EX, LIGHTYELLOW_EX, LIGHTBLUE_EX, LIGHTMAGENTA_EX, LIGHTCYAN_EX, LIGHTWHITE_EX
235
+
236
+ Cursor Positioning
237
+ ..................
238
+
239
+ ANSI codes to reposition the cursor are supported. See ``demos/demo06.py`` for
240
+ an example of how to generate them.
241
+
242
+ Init Keyword Args
243
+ .................
244
+
245
+ ``init()`` accepts some ``**kwargs`` to override default behaviour.
246
+
247
+ init(autoreset=False):
248
+ If you find yourself repeatedly sending reset sequences to turn off color
249
+ changes at the end of every print, then ``init(autoreset=True)`` will
250
+ automate that:
251
+
252
+ .. code-block:: python
253
+
254
+ from colorama import init
255
+ init(autoreset=True)
256
+ print(Fore.RED + 'some red text')
257
+ print('automatically back to default color again')
258
+
259
+ init(strip=None):
260
+ Pass ``True`` or ``False`` to override whether ANSI codes should be
261
+ stripped from the output. The default behaviour is to strip if on Windows
262
+ or if output is redirected (not a tty).
263
+
264
+ init(convert=None):
265
+ Pass ``True`` or ``False`` to override whether to convert ANSI codes in the
266
+ output into win32 calls. The default behaviour is to convert if on Windows
267
+ and output is to a tty (terminal).
268
+
269
+ init(wrap=True):
270
+ On Windows, Colorama works by replacing ``sys.stdout`` and ``sys.stderr``
271
+ with proxy objects, which override the ``.write()`` method to do their work.
272
+ If this wrapping causes you problems, then this can be disabled by passing
273
+ ``init(wrap=False)``. The default behaviour is to wrap if ``autoreset`` or
274
+ ``strip`` or ``convert`` are True.
275
+
276
+ When wrapping is disabled, colored printing on non-Windows platforms will
277
+ continue to work as normal. To do cross-platform colored output, you can
278
+ use Colorama's ``AnsiToWin32`` proxy directly:
279
+
280
+ .. code-block:: python
281
+
282
+ import sys
283
+ from colorama import init, AnsiToWin32
284
+ init(wrap=False)
285
+ stream = AnsiToWin32(sys.stderr).stream
286
+
287
+ # Python 2
288
+ print >>stream, Fore.BLUE + 'blue text on stderr'
289
+
290
+ # Python 3
291
+ print(Fore.BLUE + 'blue text on stderr', file=stream)
292
+
293
+ Recognised ANSI Sequences
294
+ .........................
295
+
296
+ ANSI sequences generally take the form::
297
+
298
+ ESC [ <param> ; <param> ... <command>
299
+
300
+ Where ``<param>`` is an integer, and ``<command>`` is a single letter. Zero or
301
+ more params are passed to a ``<command>``. If no params are passed, it is
302
+ generally synonymous with passing a single zero. No spaces exist in the
303
+ sequence; they have been inserted here simply to read more easily.
304
+
305
+ The only ANSI sequences that Colorama converts into win32 calls are::
306
+
307
+ ESC [ 0 m # reset all (colors and brightness)
308
+ ESC [ 1 m # bright
309
+ ESC [ 2 m # dim (looks same as normal brightness)
310
+ ESC [ 22 m # normal brightness
311
+
312
+ # FOREGROUND:
313
+ ESC [ 30 m # black
314
+ ESC [ 31 m # red
315
+ ESC [ 32 m # green
316
+ ESC [ 33 m # yellow
317
+ ESC [ 34 m # blue
318
+ ESC [ 35 m # magenta
319
+ ESC [ 36 m # cyan
320
+ ESC [ 37 m # white
321
+ ESC [ 39 m # reset
322
+
323
+ # BACKGROUND
324
+ ESC [ 40 m # black
325
+ ESC [ 41 m # red
326
+ ESC [ 42 m # green
327
+ ESC [ 43 m # yellow
328
+ ESC [ 44 m # blue
329
+ ESC [ 45 m # magenta
330
+ ESC [ 46 m # cyan
331
+ ESC [ 47 m # white
332
+ ESC [ 49 m # reset
333
+
334
+ # cursor positioning
335
+ ESC [ y;x H # position cursor at x across, y down
336
+ ESC [ y;x f # position cursor at x across, y down
337
+ ESC [ n A # move cursor n lines up
338
+ ESC [ n B # move cursor n lines down
339
+ ESC [ n C # move cursor n characters forward
340
+ ESC [ n D # move cursor n characters backward
341
+
342
+ # clear the screen
343
+ ESC [ mode J # clear the screen
344
+
345
+ # clear the line
346
+ ESC [ mode K # clear the line
347
+
348
+ Multiple numeric params to the ``'m'`` command can be combined into a single
349
+ sequence::
350
+
351
+ ESC [ 36 ; 45 ; 1 m # bright cyan text on magenta background
352
+
353
+ All other ANSI sequences of the form ``ESC [ <param> ; <param> ... <command>``
354
+ are silently stripped from the output on Windows.
355
+
356
+ Any other form of ANSI sequence, such as single-character codes or alternative
357
+ initial characters, are not recognised or stripped. It would be cool to add
358
+ them though. Let me know if it would be useful for you, via the Issues on
359
+ GitHub.
360
+
361
+ Status & Known Problems
362
+ -----------------------
363
+
364
+ I've personally only tested it on Windows XP (CMD, Console2), Ubuntu
365
+ (gnome-terminal, xterm), and OS X.
366
+
367
+ Some valid ANSI sequences aren't recognised.
368
+
369
+ If you're hacking on the code, see `README-hacking.md`_. ESPECIALLY, see the
370
+ explanation there of why we do not want PRs that allow Colorama to generate new
371
+ types of ANSI codes.
372
+
373
+ See outstanding issues and wish-list:
374
+ https://github.com/tartley/colorama/issues
375
+
376
+ If anything doesn't work for you, or doesn't do what you expected or hoped for,
377
+ I'd love to hear about it on that issues list, would be delighted by patches,
378
+ and would be happy to grant commit access to anyone who submits a working patch
379
+ or two.
380
+
381
+ .. _README-hacking.md: README-hacking.md
382
+
383
+ License
384
+ -------
385
+
386
+ Copyright Jonathan Hartley & Arnon Yaari, 2013-2020. BSD 3-Clause license; see
387
+ LICENSE file.
388
+
389
+ Professional support
390
+ --------------------
391
+
392
+ .. |tideliftlogo| image:: https://cdn2.hubspot.net/hubfs/4008838/website/logos/logos_for_download/Tidelift_primary-shorthand-logo.png
393
+ :alt: Tidelift
394
+ :target: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
395
+
396
+ .. list-table::
397
+ :widths: 10 100
398
+
399
+ * - |tideliftlogo|
400
+ - Professional support for colorama is available as part of the
401
+ `Tidelift Subscription`_.
402
+ Tidelift gives software development teams a single source for purchasing
403
+ and maintaining their software, with professional grade assurances from
404
+ the experts who know it best, while seamlessly integrating with existing
405
+ tools.
406
+
407
+ .. _Tidelift Subscription: https://tidelift.com/subscription/pkg/pypi-colorama?utm_source=pypi-colorama&utm_medium=referral&utm_campaign=readme
408
+
409
+ Thanks
410
+ ------
411
+
412
+ See the CHANGELOG for more thanks!
413
+
414
+ * Marc Schlaich (schlamar) for a ``setup.py`` fix for Python2.5.
415
+ * Marc Abramowitz, reported & fixed a crash on exit with closed ``stdout``,
416
+ providing a solution to issue #7's setuptools/distutils debate,
417
+ and other fixes.
418
+ * User 'eryksun', for guidance on correctly instantiating ``ctypes.windll``.
419
+ * Matthew McCormick for politely pointing out a longstanding crash on non-Win.
420
+ * Ben Hoyt, for a magnificent fix under 64-bit Windows.
421
+ * Jesse at Empty Square for submitting a fix for examples in the README.
422
+ * User 'jamessp', an observant documentation fix for cursor positioning.
423
+ * User 'vaal1239', Dave Mckee & Lackner Kristof for a tiny but much-needed Win7
424
+ fix.
425
+ * Julien Stuyck, for wisely suggesting Python3 compatible updates to README.
426
+ * Daniel Griffith for multiple fabulous patches.
427
+ * Oscar Lesta for a valuable fix to stop ANSI chars being sent to non-tty
428
+ output.
429
+ * Roger Binns, for many suggestions, valuable feedback, & bug reports.
430
+ * Tim Golden for thought and much appreciated feedback on the initial idea.
431
+ * User 'Zearin' for updates to the README file.
432
+ * John Szakmeister for adding support for light colors
433
+ * Charles Merriam for adding documentation to demos
434
+ * Jurko for a fix on 64-bit Windows CPython2.5 w/o ctypes
435
+ * Florian Bruhin for a fix when stdout or stderr are None
436
+ * Thomas Weininger for fixing ValueError on Windows
437
+ * Remi Rampin for better Github integration and fixes to the README file
438
+ * Simeon Visser for closing a file handle using 'with' and updating classifiers
439
+ to include Python 3.3 and 3.4
440
+ * Andy Neff for fixing RESET of LIGHT_EX colors.
441
+ * Jonathan Hartley for the initial idea and implementation.
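Tying the sections of the README above together, the cursor-positioning and color sequences can be emitted with plain print calls once the console has been fixed up. A short, hedged sketch that uses only just_fix_windows_console and the raw escapes listed under "Recognised ANSI Sequences"; terminal rendering may vary:

    from colorama import just_fix_windows_console

    just_fix_windows_console()                # enable ANSI handling on Windows consoles
    print('\033[2J')                          # ESC [ 2 J   -- clear the screen
    print('\033[3;10H' + 'row 3, column 10')  # ESC [ y;x H -- reposition the cursor
    print('\033[36;45;1m' + 'bright cyan on magenta' + '\033[0m')  # combined 'm' params, then reset all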
env/Lib/site-packages/colorama-0.4.6.dist-info/RECORD ADDED
@@ -0,0 +1,31 @@
1
+ colorama-0.4.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ colorama-0.4.6.dist-info/METADATA,sha256=e67SnrUMOym9sz_4TjF3vxvAV4T3aF7NyqRHHH3YEMw,17158
3
+ colorama-0.4.6.dist-info/RECORD,,
4
+ colorama-0.4.6.dist-info/WHEEL,sha256=cdcF4Fbd0FPtw2EMIOwH-3rSOTUdTCeOSXRMD1iLUb8,105
5
+ colorama-0.4.6.dist-info/licenses/LICENSE.txt,sha256=ysNcAmhuXQSlpxQL-zs25zrtSWZW6JEQLkKIhteTAxg,1491
6
+ colorama/__init__.py,sha256=wePQA4U20tKgYARySLEC047ucNX-g8pRLpYBuiHlLb8,266
7
+ colorama/__pycache__/__init__.cpython-312.pyc,,
8
+ colorama/__pycache__/ansi.cpython-312.pyc,,
9
+ colorama/__pycache__/ansitowin32.cpython-312.pyc,,
10
+ colorama/__pycache__/initialise.cpython-312.pyc,,
11
+ colorama/__pycache__/win32.cpython-312.pyc,,
12
+ colorama/__pycache__/winterm.cpython-312.pyc,,
13
+ colorama/ansi.py,sha256=Top4EeEuaQdBWdteKMEcGOTeKeF19Q-Wo_6_Cj5kOzQ,2522
14
+ colorama/ansitowin32.py,sha256=vPNYa3OZbxjbuFyaVo0Tmhmy1FZ1lKMWCnT7odXpItk,11128
15
+ colorama/initialise.py,sha256=-hIny86ClXo39ixh5iSCfUIa2f_h_bgKRDW7gqs-KLU,3325
16
+ colorama/tests/__init__.py,sha256=MkgPAEzGQd-Rq0w0PZXSX2LadRWhUECcisJY8lSrm4Q,75
17
+ colorama/tests/__pycache__/__init__.cpython-312.pyc,,
18
+ colorama/tests/__pycache__/ansi_test.cpython-312.pyc,,
19
+ colorama/tests/__pycache__/ansitowin32_test.cpython-312.pyc,,
20
+ colorama/tests/__pycache__/initialise_test.cpython-312.pyc,,
21
+ colorama/tests/__pycache__/isatty_test.cpython-312.pyc,,
22
+ colorama/tests/__pycache__/utils.cpython-312.pyc,,
23
+ colorama/tests/__pycache__/winterm_test.cpython-312.pyc,,
24
+ colorama/tests/ansi_test.py,sha256=FeViDrUINIZcr505PAxvU4AjXz1asEiALs9GXMhwRaE,2839
25
+ colorama/tests/ansitowin32_test.py,sha256=RN7AIhMJ5EqDsYaCjVo-o4u8JzDD4ukJbmevWKS70rY,10678
26
+ colorama/tests/initialise_test.py,sha256=BbPy-XfyHwJ6zKozuQOvNvQZzsx9vdb_0bYXn7hsBTc,6741
27
+ colorama/tests/isatty_test.py,sha256=Pg26LRpv0yQDB5Ac-sxgVXG7hsA1NYvapFgApZfYzZg,1866
28
+ colorama/tests/utils.py,sha256=1IIRylG39z5-dzq09R_ngufxyPZxgldNbrxKxUGwGKE,1079
29
+ colorama/tests/winterm_test.py,sha256=qoWFPEjym5gm2RuMwpf3pOis3a5r_PJZFCzK254JL8A,3709
30
+ colorama/win32.py,sha256=YQOKwMTwtGBbsY4dL5HYTvwTeP9wIQra5MvPNddpxZs,6181
31
+ colorama/winterm.py,sha256=XCQFDHjPi6AHYNdZwy0tA02H-Jh48Jp-HvCjeLeLp3U,7134
env/Lib/site-packages/colorama-0.4.6.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.11.1
3
+ Root-Is-Purelib: true
4
+ Tag: py2-none-any
5
+ Tag: py3-none-any