# document_processor.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

from config import EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP, OLLAMA_BASE_URL


class DocumentProcessor:
    """Split documents into chunks and generate embeddings via a local Ollama server."""

    def __init__(self):
        # BUG FIX: the original hard-coded model="nomic-embed-text", which made
        # the imported EMBEDDING_MODEL config constant dead — use the config
        # value so the model can be swapped in config.py without code changes.
        self.embedder = OllamaEmbeddings(
            model=EMBEDDING_MODEL,
            base_url=OLLAMA_BASE_URL,  # address of the Ollama service
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separators=["\n\n", "\n", " ", ""],
        )

    def split_documents(self, documents):
        """Split documents into smaller chunks.

        Args:
            documents: iterable of dicts, each with a 'content' key holding the
                full text and an optional 'source' key identifying its origin.

        Returns:
            list of dicts, each with 'content' (the chunk text), 'source'
            (the document's source, or 'unknown' when absent), and 'chunk'
            (the chunk's index within its parent document).
        """
        texts = []
        for doc in documents:
            # Resolve the source once per document rather than per chunk.
            source = doc.get('source', 'unknown')
            splits = self.text_splitter.split_text(doc['content'])
            for i, split in enumerate(splits):
                texts.append({
                    'content': split,
                    'source': source,
                    'chunk': i,
                })
        return texts

    def generate_embeddings(self, texts):
        """Generate an embedding vector for each text chunk.

        Args:
            texts: list of dicts as produced by split_documents (each must
                have a 'content' key).

        Returns:
            list of embedding vectors, one per input chunk, in input order.
        """
        contents = [text['content'] for text in texts]
        # embed_documents batches the whole list in a single call to Ollama.
        return self.embedder.embed_documents(contents)