Create md_knowledge_base_v1.py
md_knowledge_base_v1.py  +272 -0
ADDED
@@ -0,0 +1,272 @@
# md_knowledge_base_v1.py
import os
import json
import requests
import hashlib
from pathlib import Path
from typing import List, Dict, Optional
import time
from datetime import datetime

class MarkdownKnowledgeBase:
    def __init__(self, api_token: str, base_url: str = "https://api.siliconflow.cn/v1"):
        """
        Initialize the knowledge-base builder.

        Args:
            api_token: SiliconFlow API token
            base_url: API base URL
        """
        self.api_token = api_token
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }
        self.knowledge_base = []

    def scan_markdown_files(self, folder_path: str) -> List[str]:
        # ... (unchanged from the previous version)
        md_files = []
        folder = Path(folder_path)
        if not folder.exists():
            raise FileNotFoundError(f"Folder does not exist: {folder_path}")
        try:
            for md_file in folder.rglob("*.md"):
                if md_file.is_file():
                    file_path = str(md_file.resolve())
                    try:
                        if os.path.exists(file_path) and os.path.isfile(file_path):
                            md_files.append(file_path)
                        else:
                            print(f"Skipping inaccessible file: {file_path}")
                    except Exception as e:
                        print(f"Skipping problematic file: {md_file} - {e}")
                        continue
        except Exception as e:
            print(f"Error while scanning folder: {e}")
        print(f"Found {len(md_files)} readable Markdown files")
        return md_files

    def read_markdown_content(self, file_path: str) -> Dict:
        # ... (unchanged from the previous version)
        try:
            file_path = os.path.normpath(file_path)
            if not os.path.exists(file_path):
                print(f"File does not exist: {file_path}")
                return None
            # Try a series of encodings until one decodes the file successfully.
            encodings = ['utf-8', 'utf-8-sig', 'gbk', 'cp1252', 'latin1']
            content = None
            used_encoding = None
            for encoding in encodings:
                try:
                    with open(file_path, 'r', encoding=encoding) as file:
                        content = file.read()
                    used_encoding = encoding
                    break
                except UnicodeDecodeError:
                    continue
                except Exception as e:
                    print(f"Reading with encoding {encoding} failed: {e}")
                    continue
            if content is None:
                print(f"Cannot read file {file_path}: every encoding failed")
                return None
            file_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
            return {
                'file_path': file_path,
                'file_name': os.path.basename(file_path),
                'content': content,
                'hash': file_hash,
                'size': len(content),
                'encoding': used_encoding,
                'modified_time': datetime.fromtimestamp(os.path.getmtime(file_path)).isoformat()
            }
        except Exception as e:
            print(f"Failed to read file {file_path}: {e}")
            return None

    def chunk_text(self, text: str, chunk_size: int = 4096, overlap: int = 400) -> List[str]:
        # ... (default parameters updated to match bge-m3)
        if len(text) <= chunk_size:
            return [text]
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            if end < len(text):
                # Prefer to break on a paragraph, sentence, line, or word boundary.
                for separator in ['\n\n', '。', '\n', ' ']:
                    split_pos = text.rfind(separator, start, end)
                    if split_pos > start:
                        end = split_pos + len(separator)
                        break
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            start = max(start + 1, end - overlap)
        return chunks

    def get_embeddings(self, texts: List[str], model: str = "BAAI/bge-m3") -> List[List[float]]:
        """
        Get text embeddings.

        Args:
            texts: list of texts
            model: embedding model name - updated to bge-m3

        Returns:
            List of embedding vectors
        """
        url = f"{self.base_url}/embeddings"
        embeddings = []
        # Optimization: larger batches for efficiency and less waiting between requests.
        batch_size = 32
        total_batches = (len(texts) + batch_size - 1) // batch_size

        print(f"Processing {len(texts)} text chunks in {total_batches} batches")

        for batch_idx in range(0, len(texts), batch_size):
            batch = texts[batch_idx:batch_idx + batch_size]
            current_batch = batch_idx // batch_size + 1
            print(f"Processing batch {current_batch}/{total_batches} ({len(batch)} texts)")
            payload = {"model": model, "input": batch, "encoding_format": "float"}

            max_retries = 3
            for attempt in range(max_retries):
                try:
                    response = requests.post(url, json=payload, headers=self.headers, timeout=60)  # longer timeout
                    response.raise_for_status()
                    result = response.json()
                    if 'data' in result:
                        batch_embeddings = [item['embedding'] for item in result['data']]
                        embeddings.extend(batch_embeddings)
                        print(f"  ✓ Got {len(batch_embeddings)} vectors")
                        break
                    else:
                        print(f"  ✗ Unexpected API response format: {result}")
                        embeddings.extend([[] for _ in batch])
                        break
                except requests.exceptions.RequestException as e:
                    print(f"  ✗ Request failed (attempt {attempt + 1}/{max_retries}): {e}")
                    if attempt == max_retries - 1:
                        embeddings.extend([[] for _ in batch])

                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)

            # Optimization: shorter delay between requests.
            time.sleep(0.1)

        print(f"Embedding generation finished: {len([e for e in embeddings if e])} succeeded, {len([e for e in embeddings if not e])} failed")
        return embeddings

    def rerank_documents(self, query: str, documents: List[str],
                         model: str = "BAAI/bge-reranker-v2-m3",
                         top_n: int = 10) -> Dict:
        """
        Rerank documents against a query - updated to bge-reranker-v2-m3.
        """
        url = f"{self.base_url}/rerank"
        payload = {
            "model": model, "query": query, "documents": documents,
            "top_n": min(top_n, len(documents)), "return_documents": True
        }
        try:
            response = requests.post(url, json=payload, headers=self.headers)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Reranking failed: {e}")
            return {"results": []}

    def build_knowledge_base(self, folder_path: str, chunk_size: int = 4096, overlap: int = 400,
                             max_files: int = None, sample_mode: str = "random"):
        # ... (logic unchanged, but the default parameters were updated)
        print("Building knowledge base...")
        md_files = self.scan_markdown_files(folder_path)
        if not md_files:
            print("No processable Markdown files found")
            return
        if max_files and len(md_files) > max_files:
            print(f"Too many files ({len(md_files)}); selecting {max_files} with the '{sample_mode}' strategy")
            if sample_mode == "random":
                import random
                md_files = random.sample(md_files, max_files)
            elif sample_mode == "largest":
                file_sizes = sorted([(fp, os.path.getsize(fp)) for fp in md_files], key=lambda x: x[1], reverse=True)
                md_files = [fp for fp, _ in file_sizes[:max_files]]
            elif sample_mode == "recent":
                file_times = sorted([(fp, os.path.getmtime(fp)) for fp in md_files], key=lambda x: x[1], reverse=True)
                md_files = [fp for fp, _ in file_times[:max_files]]
        print(f"Will process {len(md_files)} files")
        all_chunks, chunk_metadata = [], []
        processed_files, skipped_files = 0, 0
        for i, file_path in enumerate(md_files, 1):
            print(f"Processing file {i}/{len(md_files)}: {os.path.basename(file_path)}")
            file_info = self.read_markdown_content(file_path)
            if not file_info or len(file_info['content'].strip()) < 50:
                skipped_files += 1
                continue
            chunks = self.chunk_text(file_info['content'], chunk_size, overlap)
            processed_files += 1
            for j, chunk in enumerate(chunks):
                if len(chunk.strip()) > 20:
                    all_chunks.append(chunk)
                    chunk_metadata.append({'file_path': file_info['file_path'], 'file_name': file_info['file_name'], 'chunk_index': j, 'chunk_count': len(chunks), 'file_hash': file_info['hash']})
        print(f"Processed {processed_files} files, skipped {skipped_files}")
        print(f"Generated {len(all_chunks)} text chunks in total")
        if not all_chunks:
            print("No valid text chunks; knowledge base build failed")
            return
        print("Generating embeddings...")
        embeddings = self.get_embeddings(all_chunks)
        self.knowledge_base = []
        valid_embeddings = 0
        for i, (chunk, embedding, metadata) in enumerate(zip(all_chunks, embeddings, chunk_metadata)):
            if embedding:
                self.knowledge_base.append({'id': len(self.knowledge_base), 'content': chunk, 'embedding': embedding, 'metadata': metadata})
                valid_embeddings += 1
        print(f"Knowledge base built! Valid vectors: {valid_embeddings}, total entries: {len(self.knowledge_base)}")

    def search(self, query: str, top_k: int = 5, use_rerank: bool = True) -> List[Dict]:
        # ... (unchanged from the previous version)
        if not self.knowledge_base:
            return []
        query_embedding = self.get_embeddings([query])[0]
        if not query_embedding:
            return []
        import numpy as np
        query_embedding_norm = np.linalg.norm(query_embedding)
        if query_embedding_norm == 0:
            return []
        # Stage 1: cosine similarity between the query and every stored chunk.
        similarities = []
        for item in self.knowledge_base:
            if not item['embedding']:
                similarities.append(0)
                continue
            item_embedding_norm = np.linalg.norm(item['embedding'])
            if item_embedding_norm == 0:
                similarities.append(0)
            else:
                similarity = np.dot(query_embedding, item['embedding']) / (query_embedding_norm * item_embedding_norm)
                similarities.append(similarity)
        top_results_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:min(top_k * 3, len(similarities))]
        # Stage 2: optionally rerank the shortlist with the reranker model.
        if use_rerank and len(top_results_indices) > 1:
            documents_to_rerank = [self.knowledge_base[i]['content'] for i in top_results_indices]
            rerank_result = self.rerank_documents(query, documents_to_rerank, top_n=top_k)
            if rerank_result.get('results'):
                final_results = []
                for res in rerank_result['results']:
                    original_index = top_results_indices[res['index']]
                    item = self.knowledge_base[original_index].copy()
                    item['relevance_score'] = res['relevance_score']
                    final_results.append(item)
                return final_results[:top_k]
        return [self.knowledge_base[i] for i in top_results_indices[:top_k]]

    def save_knowledge_base(self, output_path: str):
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.knowledge_base, f, ensure_ascii=False, indent=2)
        print(f"Knowledge base saved to: {output_path}")

    def load_knowledge_base(self, input_path: str):
        with open(input_path, 'r', encoding='utf-8') as f:
            self.knowledge_base = json.load(f)
        print(f"Knowledge base loaded from {input_path} with {len(self.knowledge_base)} entries")
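
For reference, here is a minimal usage sketch of the class above. It is not part of the committed file: the environment variable name, folder path, output file name, and query are illustrative placeholders, and it assumes the module is importable as md_knowledge_base_v1 with a valid SiliconFlow token available.

# usage_sketch.py - illustrative only, not part of this commit
import os
from md_knowledge_base_v1 import MarkdownKnowledgeBase

# Placeholder: read the SiliconFlow token from the environment.
kb = MarkdownKnowledgeBase(api_token=os.environ["SILICONFLOW_API_KEY"])

# Index a folder of Markdown notes (placeholder path), capping the run at 100 files.
kb.build_knowledge_base("./notes", max_files=100, sample_mode="recent")

# Persist chunks and embeddings so later sessions can call load_knowledge_base() instead.
kb.save_knowledge_base("knowledge_base.json")

# Two-stage retrieval: cosine similarity shortlists chunks, then the reranker orders them.
for hit in kb.search("how is the embedding batch size configured?", top_k=3):
    print(hit['metadata']['file_name'], hit.get('relevance_score'), hit['content'][:80])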