Hemang Thakur committed
Commit · ffb491f
1 Parent(s): 88b2fda

updated file reading logic

Browse files
- main.py +49 -12
- requirements.txt +1 -0
main.py CHANGED

@@ -6,6 +6,7 @@ import shutil
 import asyncio
 import logging
 import traceback
+from chardet import detect
 from httpx import AsyncClient, RequestError
 from typing import List, Dict, Any, Optional
 from fastapi.staticfiles import StaticFiles

@@ -131,19 +132,55 @@ async def process_query(user_query: str, sse_queue: asyncio.Queue):
         if filename not in state["user_files_cache"]:
             try:
                 await sse_queue.put(("step", "Reading User-Provided Files..."))
-
-
-
-
-
-                # Try
+
+                # Always read as binary first
+                with open(file_path, 'rb') as f:
+                    file_bytes = f.read()
+
+                # Try to decode with multiple strategies
+                file_content = None
+
+                # Strategy 1: Try UTF-8 with BOM handling
                 try:
-
-
-
-
-
-
+                    # Handle UTF-8 BOM if present
+                    if file_bytes.startswith(b'\xef\xbb\xbf'):
+                        file_content = file_bytes[3:].decode('utf-8')
+                    else:
+                        file_content = file_bytes.decode('utf-8')
+                    logger.info(f"Successfully decoded {filename} as UTF-8")
+                except UnicodeDecodeError:
+                    # Strategy 2: Try other common encodings
+                    for encoding in ['utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1', 'windows-1252']:
+                        try:
+                            file_content = file_bytes.decode(encoding)
+                            logger.info(f"Successfully decoded {filename} with {encoding}")
+                            break
+                        except UnicodeDecodeError:
+                            continue
+
+                # Strategy 3: If all else fails, use chardet for detection
+                if file_content is None:
+                    try:
+
+                        detected = detect(file_bytes)
+                        if detected['encoding']:
+                            file_content = file_bytes.decode(detected['encoding'])
+                            logger.info(f"Decoded {filename} with detected encoding: {detected['encoding']}")
+                    except:
+                        pass
+
+                # Final fallback: Use UTF-8 with replacement
+                if file_content is None:
+                    file_content = file_bytes.decode('utf-8', errors='replace')
+                    logger.warning(f"Had to use error replacement for {filename}")
+
+                # Store the decoded content
+                state["user_files_cache"][filename] = file_content
+                logger.info(f"Successfully cached file {filename}, length: {len(file_content)} chars")
+
+            except Exception as e:
+                logger.error(f"Error reading file {filename}: {str(e)}")
+                state["user_files_cache"][filename] = ""  # Cache empty to avoid retrying
 
         # Add all cached file contents
         for filename, content in state["user_files_cache"].items():
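For reference, the decoding cascade added above can be exercised on its own. The sketch below mirrors the strategy order from the diff (UTF-8 with BOM stripping, a list of common legacy encodings, chardet detection, then UTF-8 with replacement characters); the helper name decode_file_bytes, the module-level logger, and the sample bytes are illustrative and not part of the commit.

import logging

from chardet import detect

logger = logging.getLogger(__name__)


def decode_file_bytes(file_bytes: bytes, filename: str) -> str:
    """Decode raw bytes using the same strategy order as the main.py change."""
    # Strategy 1: UTF-8, stripping a BOM if one is present
    try:
        if file_bytes.startswith(b'\xef\xbb\xbf'):
            return file_bytes[3:].decode('utf-8')
        return file_bytes.decode('utf-8')
    except UnicodeDecodeError:
        pass

    # Strategy 2: common legacy encodings
    for encoding in ['utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1', 'windows-1252']:
        try:
            return file_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue

    # Strategy 3: let chardet guess the encoding
    detected = detect(file_bytes)
    if detected['encoding']:
        try:
            return file_bytes.decode(detected['encoding'])
        except (UnicodeDecodeError, LookupError):
            pass

    # Final fallback: UTF-8 with replacement characters
    logger.warning(f"Had to use error replacement for {filename}")
    return file_bytes.decode('utf-8', errors='replace')


# cp1252 bytes fail UTF-8 decoding and fall through to the legacy-encoding loop
print(decode_file_bytes("café".encode("cp1252"), "sample.txt"))

Note that 'latin-1' and 'iso-8859-1' accept every possible byte value, so in practice the Strategy 2 loop always succeeds for non-UTF-8 input, and the chardet and replacement fallbacks only come into play if that encoding list is trimmed.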
requirements.txt CHANGED

@@ -3,6 +3,7 @@ aiohttp==3.10.10
 anthropic==0.42.0
 beautifulsoup4==4.12.3
 bert_score==0.3.13
+chardet>=5.0.0
 crawl4ai[all]==0.3.731
 deepeval==2.0
 fake_useragent==1.5.1
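The new chardet pin backs the "from chardet import detect" import added in main.py. chardet.detect() takes raw bytes and returns a dict with the guessed encoding and a confidence score. A minimal check, with the sample text chosen here purely for illustration (detection quality depends on how many bytes chardet gets to see):

from chardet import detect

sample = "Grüße aus Köln".encode("utf-8")
result = detect(sample)
print(result)  # a dict along the lines of {'encoding': 'utf-8', 'confidence': ..., 'language': ...}
if result['encoding']:
    print(sample.decode(result['encoding']))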