#!/usr/bin/env python
# -*- coding:utf-8 _*-
"""
@author:quincy qiang
@license: Apache Licence
@file: create_knowledge.py
@time: 2023/04/18
@contact: [email protected]
@software: PyCharm
@description: - emoji:https://emojixd.com/pocket/science
"""
import os

import pandas as pd
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS
from tqdm import tqdm
# Chinese-Wikipedia ingestion example: embed each line of the cleaned dump
# as one Document and persist a FAISS index for later similarity search.
embedding_model_name = '/root/pretrained_models/text2vec-large-chinese'
docs_path = '/root/GoMall/Knowledge-ChatGLM/cache/financial_research_reports'

# Loading the HuggingFace embedding model is the expensive step; instantiate
# it once and reuse it for every index built in this script.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Wikipedia data processing.
docs = []
with open('cache/zh_wikipedia/wiki.zh-sim-cleaned.txt', 'r', encoding='utf-8') as f:
    # Stream the file line by line rather than f.readlines(): the dump is
    # large and readlines() would materialize the entire file in memory.
    for idx, line in enumerate(tqdm(f)):
        # One Document per line; the synthetic source id lets retrieval
        # results be traced back to a line number in the dump.
        metadata = {"source": f'doc_id_{idx}'}
        docs.append(Document(page_content=line.strip(), metadata=metadata))

vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local('cache/zh_wikipedia/')
# Financial research-report ingestion (kept for reference, currently disabled):
# docs = []
#
# for doc in tqdm(os.listdir(docs_path)):
#     if doc.endswith('.txt'):
#         # print(doc)
#         loader = UnstructuredFileLoader(f'{docs_path}/{doc}', mode="elements")
#         doc = loader.load()
#         docs.extend(doc)
# vector_store = FAISS.from_documents(docs, embeddings)
# vector_store.save_local('cache/financial_research_reports')
# League of Legends champion ingestion: one Document per CSV row.
docs = []
lol_df = pd.read_csv('cache/lol/champions.csv')
# Expected columns per the source CSV: id, champion short name, full name,
# birthplace, attributes, class, lore — TODO confirm against the file.
print(lol_df)
for idx, row in lol_df.iterrows():
    metadata = {"source": f'doc_id_{idx}'}
    # Convert every cell to str before joining: pandas reads the numeric
    # `id` column as int64, and ' '.join() raises TypeError on non-strings.
    text = ' '.join(str(value) for value in row.values)
    docs.append(Document(page_content=text, metadata=metadata))
vector_store = FAISS.from_documents(docs, embeddings)
vector_store.save_local('cache/lol/')