37 lines
1.3 KiB
Python
37 lines
1.3 KiB
Python
# document_processor.py
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_community.embeddings import OllamaEmbeddings
|
|
from config import EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP, OLLAMA_BASE_URL
|
|
|
|
|
|
class DocumentProcessor:
    """Split raw documents into chunks and generate embeddings via Ollama."""

    def __init__(self):
        # Fix: honor config.EMBEDDING_MODEL. The constant was imported but the
        # model name was previously hard-coded as "nomic-embed-text", so
        # changing the config had no effect on the embedder.
        self.embedder = OllamaEmbeddings(
            model=EMBEDDING_MODEL,
            base_url=OLLAMA_BASE_URL,  # Ollama service address
        )
        # Recursive splitter falls back through the separators in order:
        # paragraph -> line -> word -> character.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separators=["\n\n", "\n", " ", ""],
        )

    def split_documents(self, documents):
        """Split documents into smaller chunks.

        Args:
            documents: iterable of dicts, each with a 'content' key holding
                the document text and an optional 'source' key identifying
                where the document came from.

        Returns:
            list[dict]: one dict per chunk with keys 'content' (the chunk
            text), 'source' (the document's source, or 'unknown' when
            absent), and 'chunk' (the 0-based index of the chunk within
            its source document).
        """
        return [
            {
                'content': chunk,
                'source': doc.get('source', 'unknown'),
                'chunk': index,
            }
            for doc in documents
            for index, chunk in enumerate(
                self.text_splitter.split_text(doc['content'])
            )
        ]

    def generate_embeddings(self, texts):
        """Generate embedding vectors for the given text chunks.

        Args:
            texts: list of dicts, each containing a 'content' key (as
                produced by split_documents).

        Returns:
            list of embedding vectors, one per input chunk, in input order.
        """
        contents = [text['content'] for text in texts]
        # embed_documents sends all texts to Ollama in a single batched call.
        return self.embedder.embed_documents(contents)
|