Files
AI-Search/org/document_processor.py
2025-12-15 14:38:31 +08:00

37 lines
1.3 KiB
Python

# document_processor.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from config import EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP, OLLAMA_BASE_URL
class DocumentProcessor:
    """Split raw documents into overlapping chunks and embed them via Ollama."""

    def __init__(self):
        # Bug fix: use the configured EMBEDDING_MODEL constant. It was
        # imported from config but the model name was hard-coded as
        # "nomic-embed-text", so changing the config had no effect.
        self.embedder = OllamaEmbeddings(
            model=EMBEDDING_MODEL,
            base_url=OLLAMA_BASE_URL,  # address of the Ollama service
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            # Try paragraph breaks first, then lines, words, characters.
            separators=["\n\n", "\n", " ", ""],
        )

    def split_documents(self, documents):
        """Split each document's content into smaller chunks.

        Args:
            documents: iterable of dicts, each with a 'content' key and an
                optional 'source' key.

        Returns:
            List of dicts shaped {'content': str, 'source': str, 'chunk': int},
            where 'chunk' is the chunk's index within its source document and
            'source' defaults to 'unknown' when absent.
        """
        chunks = []
        for doc in documents:
            pieces = self.text_splitter.split_text(doc['content'])
            for index, piece in enumerate(pieces):
                chunks.append({
                    'content': piece,
                    'source': doc.get('source', 'unknown'),
                    'chunk': index,
                })
        return chunks

    def generate_embeddings(self, texts):
        """Generate one embedding vector per chunk dict.

        Args:
            texts: list of dicts with a 'content' key, as produced by
                split_documents().

        Returns:
            List of embedding vectors (same order as the input chunks).
        """
        contents = [text['content'] for text in texts]
        # embed_documents sends the whole batch to the Ollama server in one call.
        return self.embedder.embed_documents(contents)