Hemang Thakur committed on
Commit
ffb491f
·
1 Parent(s): 88b2fda

updated file reading logic

Browse files
Files changed (2) hide show
  1. main.py +49 -12
  2. requirements.txt +1 -0
main.py CHANGED
@@ -6,6 +6,7 @@ import shutil
6
  import asyncio
7
  import logging
8
  import traceback
 
9
  from httpx import AsyncClient, RequestError
10
  from typing import List, Dict, Any, Optional
11
  from fastapi.staticfiles import StaticFiles
@@ -131,19 +132,55 @@ async def process_query(user_query: str, sse_queue: asyncio.Queue):
131
  if filename not in state["user_files_cache"]:
132
  try:
133
  await sse_queue.put(("step", "Reading User-Provided Files..."))
134
- with open(file_path, 'r', encoding='utf-8') as f:
135
- file_content = f.read()
136
- state["user_files_cache"][filename] = file_content
137
- except Exception as e:
138
- logger.error(f"Error reading file {filename}: {e}")
139
- # Try reading as binary and decode
 
 
 
140
  try:
141
- with open(file_path, 'rb') as f:
142
- file_content = f.read().decode('utf-8', errors='ignore')
143
- state["user_files_cache"][filename] = file_content
144
- except Exception as e2:
145
- logger.error(f"Error reading file {filename} as binary: {e2}")
146
- state["user_files_cache"][filename] = "" # Cache empty to avoid retrying
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  # Add all cached file contents
149
  for filename, content in state["user_files_cache"].items():
 
6
  import asyncio
7
  import logging
8
  import traceback
9
+ from chardet import detect
10
  from httpx import AsyncClient, RequestError
11
  from typing import List, Dict, Any, Optional
12
  from fastapi.staticfiles import StaticFiles
 
132
  if filename not in state["user_files_cache"]:
133
  try:
134
  await sse_queue.put(("step", "Reading User-Provided Files..."))
135
+
136
+ # Always read as binary first
137
+ with open(file_path, 'rb') as f:
138
+ file_bytes = f.read()
139
+
140
+ # Try to decode with multiple strategies
141
+ file_content = None
142
+
143
+ # Strategy 1: Try UTF-8 with BOM handling
144
  try:
145
+ # Handle UTF-8 BOM if present
146
+ if file_bytes.startswith(b'\xef\xbb\xbf'):
147
+ file_content = file_bytes[3:].decode('utf-8')
148
+ else:
149
+ file_content = file_bytes.decode('utf-8')
150
+ logger.info(f"Successfully decoded {filename} as UTF-8")
151
+ except UnicodeDecodeError:
152
+ # Strategy 2: Try other common encodings
153
+ for encoding in ['utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1', 'windows-1252']:
154
+ try:
155
+ file_content = file_bytes.decode(encoding)
156
+ logger.info(f"Successfully decoded {filename} with {encoding}")
157
+ break
158
+ except UnicodeDecodeError:
159
+ continue
160
+
161
+ # Strategy 3: If all else fails, use chardet for detection
162
+ if file_content is None:
163
+ try:
164
+
165
+ detected = detect(file_bytes)
166
+ if detected['encoding']:
167
+ file_content = file_bytes.decode(detected['encoding'])
168
+ logger.info(f"Decoded {filename} with detected encoding: {detected['encoding']}")
169
+ except:
170
+ pass
171
+
172
+ # Final fallback: Use UTF-8 with replacement
173
+ if file_content is None:
174
+ file_content = file_bytes.decode('utf-8', errors='replace')
175
+ logger.warning(f"Had to use error replacement for {filename}")
176
+
177
+ # Store the decoded content
178
+ state["user_files_cache"][filename] = file_content
179
+ logger.info(f"Successfully cached file {filename}, length: {len(file_content)} chars")
180
+
181
+ except Exception as e:
182
+ logger.error(f"Error reading file {filename}: {str(e)}")
183
+ state["user_files_cache"][filename] = "" # Cache empty to avoid retrying
184
 
185
  # Add all cached file contents
186
  for filename, content in state["user_files_cache"].items():
requirements.txt CHANGED
@@ -3,6 +3,7 @@ aiohttp==3.10.10
3
  anthropic==0.42.0
4
  beautifulsoup4==4.12.3
5
  bert_score==0.3.13
 
6
  crawl4ai[all]==0.3.731
7
  deepeval==2.0
8
  fake_useragent==1.5.1
 
3
  anthropic==0.42.0
4
  beautifulsoup4==4.12.3
5
  bert_score==0.3.13
6
+ chardet>=5.0.0
7
  crawl4ai[all]==0.3.731
8
  deepeval==2.0
9
  fake_useragent==1.5.1