add rag test

2026-06-12 10:57:45 +07:00
parent 5c5e3333a5
commit 3ebf6f450d
6 changed files with 1251 additions and 0 deletions
@@ -0,0 +1,14 @@
 # LLM API Configuration
 # Get your API key from:
 # - OpenAI: https://platform.openai.com/api-keys
 # - Together.ai: https://api.together.xyz/settings/api-keys
 # - Groq: https://console.groq.com/keys
 # - Or any OpenAI-compatible API
 OPENAI_API_KEY=your_api_key_here
 # Optional: Custom base URL for OpenAI-compatible APIs
 # LLM_BASE_URL=https://api.openai.com/v1
 # Optional: Model name (default: gpt-4o-mini)
 # LLM_MODEL=gpt-4o-mini
@@ -0,0 +1,10 @@
 # LLM API Configuration
 # Get your API key from https://platform.openai.com/api-keys
 # Or use any OpenAI-compatible API (e.g., Together.ai, Groq, etc.)
 OPENAI_API_KEY=your_api_key_here
 # Optional: Custom base URL for OpenAI-compatible APIs
 # LLM_BASE_URL=https://api.openai.com/v1
 # Optional: Model name (default: gpt-4o-mini)
 # LLM_MODEL=gpt-4o-mini
@@ -0,0 +1,320 @@
 #!/usr/bin/env python3
 """
 OrangePi RAG Application
 A Vietnamese-language RAG system for querying Orange Pi blog articles.
 """
 import os
 import json
 import argparse
 import sys
 from pathlib import Path
 from typing import List, Dict, Any, Optional
 from dataclasses import dataclass
 import numpy as np
 import faiss
 from sentence_transformers import SentenceTransformer
 import requests
 from dotenv import load_dotenv
 from tqdm import tqdm
@dataclass
 class Chunk:
    chunk_id: str
    article_id: str
    content: str
    section: Optional[str]
    title: str
    url: str
    language: str
    metadata: Dict[str, Any]
    embedding: Optional[np.ndarray] = None
 class VectorStore:
    def __init__(self, dim: int):
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)
        self.chunks: List[Chunk] = []
    def add(self, chunks: List[Chunk], embeddings: np.ndarray):
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings.astype(np.float32))
        self.chunks.extend(chunks)
    def search(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Chunk]:
        faiss.normalize_L2(query_embedding.reshape(1, -1))
        scores, indices = self.index.search(query_embedding.reshape(1, -1).astype(np.float32), top_k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx >= 0 and idx < len(self.chunks):
                chunk = self.chunks[idx]
                chunk.metadata["similarity_score"] = float(score)
                results.append(chunk)
        return results
    def save(self, path: Path):
        faiss.write_index(self.index, str(path / "faiss.index"))
        with open(path / "chunks.jsonl", "w", encoding="utf-8") as f:
            for chunk in self.chunks:
                data = {
                    "chunk_id": chunk.chunk_id,
                    "article_id": chunk.article_id,
                    "content": chunk.content,
                    "section": chunk.section,
                    "title": chunk.title,
                    "url": chunk.url,
                    "language": chunk.language,
                    "metadata": chunk.metadata,
                }
                f.write(json.dumps(data, ensure_ascii=False) + "\n")
    @classmethod
    def load(cls, path: Path, dim: int) -> "VectorStore":
        store = cls(dim)
        store.index = faiss.read_index(str(path / "faiss.index"))
        with open(path / "chunks.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                store.chunks.append(Chunk(
                    chunk_id=data["chunk_id"],
                    article_id=data["article_id"],
                    content=data["content"],
                    section=data.get("section"),
                    title=data["title"],
                    url=data["url"],
                    language=data["language"],
                    metadata=data["metadata"],
                ))
        return store
 class Embedder:
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        self.model = SentenceTransformer(model_name)
        self.dim = self.model.get_embedding_dimension()
    def embed(self, texts: List[str]) -> np.ndarray:
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    def embed_query(self, query: str) -> np.ndarray:
        return self.model.encode([query], show_progress_bar=False, convert_to_numpy=True)[0]
 class LLMClient:
    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.openai.com/v1",
        model: str = "gpt-4o-mini",
    ):
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
    def generate(self, prompt: str, temperature: float = 0.1, max_tokens: int = 1000) -> str:
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": self._system_prompt()},
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        resp = requests.post(
            f"{self.base_url}/chat/completions",
            headers=self.headers,
            json=payload,
            timeout=60,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"].strip()
    def _system_prompt(self) -> str:
        return """Bạn là một trợ lý AI chuyên về Orange Pi, sử dụng dữ liệu từ blog orangepi.vn (nhà phân phối chính thức Orange Pi tại Việt Nam).
 NHIỆM VỤ:
 - Trả lời câu hỏi của người dùng CHỈ DỰA TRÊN THÔNG TIN ĐƯỢC CUNG CẤP trong phần "NGUYÊN LIỆU".
 - Nếu thông tin không có trong nguyên liệu, hãy trả lời: "Không có thông tin trong dữ liệu."
 - KHÔNG được bịa đặt, suy diễn hoặc sử dụng kiến thức bên ngoài.
 - Trả lời bằng tiếng Việt, ngắn gọn, chính xác.
 - Trích dẫn nguồn (title + URL) khi có thể.
 NGUYÊN LIỆU:
 {context}"""
 class RAGPipeline:
    def __init__(
        self,
        data_dir: Path,
        index_dir: Path,
        embedder: Embedder,
        llm: LLMClient,
        top_k: int = 5,
    ):
        self.data_dir = data_dir
        self.index_dir = index_dir
        self.embedder = embedder
        self.llm = llm
        self.top_k = top_k
        self.store: Optional[VectorStore] = None
    def build_index(self):
        print("Loading chunks...")
        chunks = self._load_chunks()
        print(f"Loaded {len(chunks)} chunks")
        print("Generating embeddings...")
        texts = [c.content for c in chunks]
        embeddings = self.embedder.embed(texts)
        print("Building FAISS index...")
        self.store = VectorStore(self.embedder.dim)
        self.store.add(chunks, embeddings)
        print("Saving index...")
        self.index_dir.mkdir(parents=True, exist_ok=True)
        self.store.save(self.index_dir)
        print(f"Index saved to {self.index_dir}")
    def load_index(self):
        print("Loading index...")
        self.store = VectorStore.load(self.index_dir, self.embedder.dim)
        print(f"Loaded {len(self.store.chunks)} chunks")
    def _load_chunks(self) -> List[Chunk]:
        chunks = []
        chunks_path = self.data_dir / "chunks.jsonl"
        with open(chunks_path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                chunks.append(Chunk(
                    chunk_id=data["chunk_id"],
                    article_id=data["article_id"],
                    content=data["content"],
                    section=data.get("section"),
                    title=data["title"],
                    url=data["url"],
                    language=data["language"],
                    metadata=data["metadata"],
                ))
        return chunks
    def query(self, question: str) -> str:
        if self.store is None:
            raise RuntimeError("Index not loaded. Call build_index() or load_index() first.")
        query_emb = self.embedder.embed_query(question)
        results = self.store.search(query_emb, top_k=self.top_k)
        if not results:
            return "Không có thông tin trong dữ liệu."
        context_parts = []
        for i, chunk in enumerate(results, 1):
            source = f"[{i}] {chunk.title} ({chunk.url})"
            if chunk.section:
                source += f" - Section: {chunk.section}"
            context_parts.append(f"{source}\n{chunk.content}")
        context = "\n\n---\n\n".join(context_parts)
        prompt = f"NGUYÊN LIỆU:\n{context}\n\nCÂU HỎI: {question}"
        return self.llm.generate(prompt)
 def main():
    # Force UTF-8 output on Windows
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    if hasattr(sys.stderr, 'reconfigure'):
        sys.stderr.reconfigure(encoding='utf-8')
    parser = argparse.ArgumentParser(description="OrangePi RAG Application")
    parser.add_argument("--data-dir", type=Path, default=Path("."), help="Directory with chunks.jsonl")
    parser.add_argument("--index-dir", type=Path, default=Path("./rag_index"), help="FAISS index directory")
    parser.add_argument("--build", action="store_true", help="Build index from chunks")
    parser.add_argument("--query", type=str, help="Query to answer")
    parser.add_argument("--interactive", action="store_true", help="Interactive chat mode")
    parser.add_argument("--retrieve-only", action="store_true", help="Test retrieval without LLM")
    parser.add_argument("--top-k", type=int, default=5, help="Number of chunks to retrieve")
    parser.add_argument("--embed-model", default="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    parser.add_argument("--llm-model", default="gpt-4o-mini")
    parser.add_argument("--llm-base-url", default="https://api.openai.com/v1")
    args = parser.parse_args()
    load_dotenv()
    embedder = Embedder(args.embed_model)
    if args.build:
        # Build doesn't need LLM
        pipeline = RAGPipeline(args.data_dir, args.index_dir, embedder, None, args.top_k)
        pipeline.build_index()
        return 0
    if args.retrieve_only:
        # Retrieval test without LLM
        pipeline = RAGPipeline(args.data_dir, args.index_dir, embedder, None, args.top_k)
        pipeline.load_index()
        query_emb = embedder.embed_query(args.query)
        results = pipeline.store.search(query_emb, top_k=args.top_k)
        for i, chunk in enumerate(results, 1):
            print(f"\n--- Result {i} (score: {chunk.metadata.get('similarity_score', 0):.4f}) ---")
            print(f"Title: {chunk.title}")
            print(f"URL: {chunk.url}")
            if chunk.section:
                print(f"Section: {chunk.section}")
            print(f"Content: {chunk.content[:500]}...".encode('utf-8', errors='replace').decode('utf-8'))
        return 0
    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("LLM_API_KEY")
    if not api_key:
        print("ERROR: Set OPENAI_API_KEY or LLM_API_KEY in environment or .env file")
        print("       Or use --retrieve-only to test retrieval without LLM")
        return 1
    llm = LLMClient(api_key, args.llm_base_url, args.llm_model)
    pipeline = RAGPipeline(args.data_dir, args.index_dir, embedder, llm, args.top_k)
    if args.build:
        pipeline.build_index()
        return 0
    pipeline.load_index()
    if args.query:
        print(pipeline.query(args.query))
        return 0
    if args.interactive:
        print("OrangePi RAG - Interactive mode (Ctrl+C to exit)")
        print("=" * 50)
        while True:
            try:
                question = input("\n❓ Câu hỏi: ").strip()
                if not question:
                    continue
                answer = pipeline.query(question)
                print(f"\n🤖 Trả lời: {answer}")
            except KeyboardInterrupt:
                print("\nTạm biệt!")
                break
            except EOFError:
                break
        return 0
    parser.print_help()
    return 1
 if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,6 @@
 sentence-transformers>=2.2.0
 faiss-cpu>=1.7.4
 numpy>=1.24.0
 requests>=2.31.0
 python-dotenv>=1.0.0
 tqdm>=4.65.0