Spaces:

chen666-666
/

wechat-ner-re

Running

App Files Community

chen666-666 committed on Apr 15

Commit

1e5ba7c

1 Parent(s): b8c346d

更新代码：添加新的功能

Browse files

Files changed (7) hide show

.idea/.gitignore +3 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
.idea/wechat-ner-re.iml +8 -0
app.py +38 -67

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# 默认忽略的文件
+/shelf/
+/workspace.xml

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/wechat-ner-re.iml" filepath="$PROJECT_DIR$/.idea/wechat-ner-re.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>

.idea/wechat-ner-re.iml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-from transformers import BertTokenizer, BertModel
 import gradio as gr
 import re
 import os
@@ -8,11 +8,16 @@ import pandas as pd
 import chardet
 from pyvis.network import Network
 import networkx as nx
-# 初始化模型
-model_name = "bert-base-chinese"
-tokenizer = BertTokenizer.from_pretrained(model_name)
-model = BertModel.from_pretrained(model_name)
 # 知识图谱数据存储
 knowledge_graph = {
@@ -56,7 +61,7 @@ def visualize_kg():
                              font={'size': 14})
                 seen_edges.add(edge_key)
-    net.set_options("""
     {
       "nodes": {
         "scaling": {
@@ -88,7 +93,16 @@ def visualize_kg():
     return f'<div class="kg-graph">{html}</div>'
-def ner(text):
     name_pattern = r"([赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜][\u4e00-\u9fa5]{1,2})(?![的地得啦啊呀])"
     id_pattern = r"(?<!\S)([a-zA-Z_][a-zA-Z0-9_]{4,})(?![\u4e00-\u9fa5])"
@@ -120,60 +134,13 @@ def ner(text):
             })
             occupied.add((start, end))
-    return sorted(entities, key=lambda x: x["start"])
-def re_extract(entities, text):
-    relations = []
-    triggers = {
-        "recommend": ["推荐", "引荐", "建议", "找"],
-        "send_to": ["发送", "转发", "发给", "抄送"],
-        "mention": ["提到", "提及", "@", "说"]
-    }
-    for i in range(len(entities)):
-        for j in range(max(0, i - 2), min(len(entities), i + 3)):
-            if i == j:
-                continue
-            ctx_start = entities[i]["end"]
-            ctx_end = entities[j]["start"]
-            context = text[ctx_start:ctx_end].strip()
-            if text.startswith('@', entities[i]["start"] - 1):
-                relations.append({
-                    "head": entities[i]["text"],
-                    "tail": entities[j]["text"],
-                    "relation": "mention"
-                })
-                continue
-            relation_type = "knows"
-            for rel_type, keywords in triggers.items():
-                if any(kw in context for kw in keywords):
-                    relation_type = rel_type
-                    break
-            relations.append({
-                "head": entities[i]["text"],
-                "tail": entities[j]["text"],
-                "relation": relation_type
-            })
-    unique_relations = []
-    seen = set()
-    for rel in relations:
-        key = (rel["head"], rel["tail"], rel["relation"])
-        if key not in seen:
-            unique_relations.append(rel)
-            seen.add(key)
-    return unique_relations
-def process_text(text):
     try:
-        entities = ner(text)
         relations = re_extract(entities, text)
         update_knowledge_graph(entities, relations)
@@ -187,10 +154,10 @@ def process_text(text):
         )
         kg_html = visualize_kg()
-        return entity_output, relation_output, gr.HTML(kg_html)
     except Exception as e:
-        return f"处理出错: {str(e)}", "", gr.HTML()
 def detect_encoding(file_path):
@@ -198,7 +165,7 @@ def detect_encoding(file_path):
         return chardet.detect(f.read(4096))['encoding'] or 'utf-8'
-def process_file(file):
     ext = os.path.splitext(file.name)[-1].lower()
     full_text = ""
@@ -238,10 +205,10 @@ def process_file(file):
         else:
             return f"❌ 不支持的文件类型: {ext}", "", gr.HTML()
-        return process_text(full_text)
     except Exception as e:
-        return f"❌ 文件处理错误: {str(e)}", "", gr.HTML()
 # Gradio UI
@@ -267,12 +234,14 @@ with gr.Blocks(css=css) as demo:
         gr.Markdown("### 直接输入聊天内容")
         input_text = gr.Textbox(label="输入内容", lines=8,
                                 placeholder="示例：张三@李四 请把需求文档_v2发送给王五")
         analyze_btn = gr.Button("开始分析", variant="primary")
         with gr.Row():
             entity_output = gr.Textbox(label="识别实体", interactive=False)
             relation_output = gr.Textbox(label="发现关系", interactive=False)
         kg_display = gr.HTML(label="知识图谱", elem_classes="kg-container")
         analyze_btn.click(
             fn=lambda: gr.update(interactive=False),
@@ -280,8 +249,8 @@ with gr.Blocks(css=css) as demo:
             outputs=analyze_btn
         ).then(
             fn=process_text,
-            inputs=[input_text],
-            outputs=[entity_output, relation_output, kg_display]
         ).then(
             fn=lambda: gr.update(interactive=True),
             inputs=None,
@@ -291,17 +260,19 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab("📁 文件分析"):
         gr.Markdown("### 上传聊天记录文件")
         file_input = gr.File(label="选择文件", file_types=[".txt", ".json", ".jsonl", ".csv"])
         file_btn = gr.Button("分析文件", variant="primary")
         with gr.Row():
             file_entity = gr.Textbox(label="识别实体", interactive=False)
             file_relation = gr.Textbox(label="发现关系", interactive=False)
         file_kg = gr.HTML(elem_classes="kg-container")
         file_btn.click(
             fn=process_file,
-            inputs=[file_input],
-            outputs=[file_entity, file_relation, file_kg]
         )
     with gr.Tab("🗺️ 完整图谱"):

 import torch
+from transformers import BertTokenizer, BertModel, LlamaTokenizer, LlamaForCausalLM
 import gradio as gr
 import re
 import os
 import chardet
 from pyvis.network import Network
 import networkx as nx
+import time
+# 初始化 BERT 和 LLaMA 2 模型
+bert_model_name = "bert-base-chinese"
+bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+bert_model = BertModel.from_pretrained(bert_model_name)
+llama_model_name = "meta-llama/Llama-2-7b-chat-hf"
+llama_tokenizer = LlamaTokenizer.from_pretrained(llama_model_name)
+llama_model = LlamaForCausalLM.from_pretrained(llama_model_name)
 # 知识图谱数据存储
 knowledge_graph = {
                              font={'size': 14})
                 seen_edges.add(edge_key)
+    net.set_options("""
     {
       "nodes": {
         "scaling": {
     return f'<div class="kg-graph">{html}</div>'
+def ner(text, model_type="bert"):
+    # 选择模型进行处理
+    start_time = time.time()
+    if model_type == "bert":
+        tokenizer = bert_tokenizer
+        model = bert_model
+    elif model_type == "llama":
+        tokenizer = llama_tokenizer
+        model = llama_model
     name_pattern = r"([赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜][\u4e00-\u9fa5]{1,2})(?![的地得啦啊呀])"
     id_pattern = r"(?<!\S)([a-zA-Z_][a-zA-Z0-9_]{4,})(?![\u4e00-\u9fa5])"
             })
             occupied.add((start, end))
+    processing_time = time.time() - start_time
+    return entities, processing_time
+def process_text(text, model_type="bert"):
     try:
+        entities, processing_time = ner(text, model_type=model_type)
         relations = re_extract(entities, text)
         update_knowledge_graph(entities, relations)
         )
         kg_html = visualize_kg()
+        return entity_output, relation_output, gr.HTML(kg_html), f"处理时间：{processing_time:.2f}秒"
     except Exception as e:
+        return f"处理出错: {str(e)}", "", gr.HTML(), ""
 def detect_encoding(file_path):
         return chardet.detect(f.read(4096))['encoding'] or 'utf-8'
+def process_file(file, model_type="bert"):
     ext = os.path.splitext(file.name)[-1].lower()
     full_text = ""
         else:
             return f"❌ 不支持的文件类型: {ext}", "", gr.HTML()
+        return process_text(full_text, model_type)
     except Exception as e:
+        return f"❌ 文件处理错误: {str(e)}", "", gr.HTML(), ""
 # Gradio UI
         gr.Markdown("### 直接输入聊天内容")
         input_text = gr.Textbox(label="输入内容", lines=8,
                                 placeholder="示例：张三@李四 请把需求文档_v2发送给王五")
+        model_type = gr.Radio(["bert", "llama"], label="选择模型", value="bert")
         analyze_btn = gr.Button("开始分析", variant="primary")
         with gr.Row():
             entity_output = gr.Textbox(label="识别实体", interactive=False)
             relation_output = gr.Textbox(label="发现关系", interactive=False)
         kg_display = gr.HTML(label="知识图谱", elem_classes="kg-container")
+        time_output = gr.Textbox(label="处理时间", interactive=False)
         analyze_btn.click(
             fn=lambda: gr.update(interactive=False),
             outputs=analyze_btn
         ).then(
             fn=process_text,
+            inputs=[input_text, model_type],
+            outputs=[entity_output, relation_output, kg_display, time_output]
         ).then(
             fn=lambda: gr.update(interactive=True),
             inputs=None,
     with gr.Tab("📁 文件分析"):
         gr.Markdown("### 上传聊天记录文件")
         file_input = gr.File(label="选择文件", file_types=[".txt", ".json", ".jsonl", ".csv"])
+        file_model_type = gr.Radio(["bert", "llama"], label="选择模型", value="bert")
         file_btn = gr.Button("分析文件", variant="primary")
         with gr.Row():
             file_entity = gr.Textbox(label="识别实体", interactive=False)
             file_relation = gr.Textbox(label="发现关系", interactive=False)
         file_kg = gr.HTML(elem_classes="kg-container")
+        file_time = gr.Textbox(label="处理时间", interactive=False)
         file_btn.click(
             fn=process_file,
+            inputs=[file_input, file_model_type],
+            outputs=[file_entity, file_relation, file_kg, file_time]
         )
     with gr.Tab("🗺️ 完整图谱"):