1
This commit is contained in:
36
org/document_processor.py
Normal file
36
org/document_processor.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# document_processor.py
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_community.embeddings import OllamaEmbeddings
|
||||
from config import EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP, OLLAMA_BASE_URL
|
||||
|
||||
|
||||
class DocumentProcessor:
    """Split raw documents into chunks and embed them via an Ollama server.

    Depends on module-level config values imported at the top of the file:
    EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP and OLLAMA_BASE_URL.
    """

    def __init__(self):
        # Fix: use the configured EMBEDDING_MODEL instead of the hard-coded
        # "nomic-embed-text" — the config import was otherwise ignored.
        self.embedder = OllamaEmbeddings(
            model=EMBEDDING_MODEL,
            base_url=OLLAMA_BASE_URL,  # address of the Ollama service
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            # Prefer paragraph, then line, then word boundaries; "" is the
            # final fallback so oversized tokens are still split.
            separators=["\n\n", "\n", " ", ""],
        )

    def split_documents(self, documents):
        """Split each document's text into small chunks.

        Args:
            documents: iterable of dicts, each with a 'content' key and an
                optional 'source' key.

        Returns:
            list of dicts of the form
            {'content': chunk text,
             'source': originating source (defaults to 'unknown'),
             'chunk': chunk index within its document}.
        """
        texts = []
        for doc in documents:
            splits = self.text_splitter.split_text(doc['content'])
            for i, split in enumerate(splits):
                texts.append({
                    'content': split,
                    'source': doc.get('source', 'unknown'),
                    'chunk': i,
                })
        return texts

    def generate_embeddings(self, texts):
        """Generate an embedding vector for each chunk.

        Args:
            texts: list of chunk dicts as produced by split_documents().

        Returns:
            list of embedding vectors, one per input chunk, in input order.
        """
        contents = [text['content'] for text in texts]
        # embed_documents() batches all contents in a single request.
        return self.embedder.embed_documents(contents)
|
||||
Reference in New Issue
Block a user