## What is RAG?
Retrieval-Augmented Generation (RAG) is a technique that enhances Large Language Models (LLMs) by providing them with relevant context from external knowledge sources. Instead of relying solely on the model's training data, RAG:
- Retrieves relevant documents based on the user's query
- Augments the prompt with this context
- Generates a response grounded in real data
This approach reduces hallucinations, provides citations, and keeps responses up-to-date with current information.
## Why WikiRest for RAG?
WikiRest is purpose-built for RAG applications:
- **Pre-chunked passages**: ~500 tokens each, sized to fit context windows
- **Sub-50ms search**: real-time retrieval without blocking
- **Clean text**: no HTML or wikitext to parse
- **Source URLs**: every chunk includes attribution (see the sample hit below)
- **6+ million articles**: comprehensive knowledge coverage
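The code in this guide assumes search hits with `id`, `title`, `text`, `section`, and `url` fields. A hypothetical hit might look like this (the field names are inferred from the examples below, and the values are made up for illustration; consult the API reference for the authoritative schema):

```python
# Hypothetical shape of one /v1/search hit -- illustrative values only
sample_hit = {
    "id": "enwiki-12345-7",
    "title": "Quantum entanglement",
    "section": "Overview",
    "text": "Quantum entanglement is a phenomenon in which ...",
    "url": "https://en.wikipedia.org/wiki/Quantum_entanglement",
}
```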
## Basic Implementation
Here's a simple RAG implementation in Python:
```python
import requests
import openai

# Configuration
WIKIREST_API_KEY = "your_wikirest_key"
OPENAI_API_KEY = "your_openai_key"
WIKIREST_URL = "https://api.wikirest.com/v1"


def retrieve_context(query: str, limit: int = 5) -> list[dict]:
    """Retrieve relevant Wikipedia passages for a query."""
    response = requests.get(
        f"{WIKIREST_URL}/search",
        params={"q": query, "limit": limit},
        headers={"X-API-Key": WIKIREST_API_KEY},
    )
    response.raise_for_status()  # Fail loudly on auth or rate-limit errors
    return response.json()["hits"]


def build_prompt(query: str, context: list[dict]) -> str:
    """Build a prompt with retrieved context."""
    context_text = "\n\n".join(
        f"[{c['title']}]\n{c['text']}" for c in context
    )
    return f"""Answer the question using the provided context.
Cite your sources using [Title] format.

Context:
{context_text}

Question: {query}

Answer:"""


def generate_response(query: str) -> str:
    """Generate a RAG response."""
    # 1. Retrieve relevant context
    context = retrieve_context(query)

    # 2. Build the augmented prompt
    prompt = build_prompt(query, context)

    # 3. Generate the response
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=1000,
    )
    return response.choices[0].message.content


# Example usage
if __name__ == "__main__":
    question = "What is quantum entanglement?"
    answer = generate_response(question)
    print(answer)
```
## Advanced Patterns

### Multi-Query Retrieval
Expand your search by generating multiple query variations:
```python
def expand_query(original_query: str) -> list[str]:
    """Generate query variations for better retrieval."""
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{
            "role": "user",
            "content": f"""Generate 3 different search queries to find information about:
"{original_query}"

Return only the queries, one per line.""",
        }],
    )
    # Strip whitespace and drop any blank lines the model may add
    queries = [
        q.strip()
        for q in response.choices[0].message.content.strip().split("\n")
        if q.strip()
    ]
    return [original_query] + queries[:3]


def retrieve_multi_query(query: str, limit: int = 3) -> list[dict]:
    """Retrieve using multiple query variations, deduplicated by chunk id."""
    queries = expand_query(query)
    all_results = []
    seen_ids = set()
    for q in queries:
        results = retrieve_context(q, limit=limit)
        for r in results:
            if r["id"] not in seen_ids:
                all_results.append(r)
                seen_ids.add(r["id"])
    return all_results[:limit * 2]  # Return more diverse results
```
### Reranking Results

Use a reranker to improve retrieval quality:
```python
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


def rerank_results(query: str, results: list[dict], top_k: int = 5) -> list[dict]:
    """Rerank results using a cross-encoder."""
    pairs = [(query, r["text"]) for r in results]
    scores = reranker.predict(pairs)
    # Sort by score, descending
    ranked = sorted(
        zip(results, scores),
        key=lambda x: x[1],
        reverse=True,
    )
    return [r for r, _ in ranked[:top_k]]
```
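The two patterns compose naturally: expand the query, retrieve a wider deduplicated candidate pool, then rerank down to the handful of chunks you actually send to the model. A short sketch using the functions defined above:

```python
question = "How do vaccines train the immune system?"

# Cast a wide net with query expansion, then keep only the best matches
candidates = retrieve_multi_query(question, limit=4)
top_chunks = rerank_results(question, candidates, top_k=5)

for chunk in top_chunks:
    print(chunk["title"])
```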
## Optimization Tips

### Token Optimization
- WikiRest chunks are ~500 tokens - plan your context budget accordingly
- For GPT-4 (8k context): Use 5-8 chunks max
- For GPT-4 Turbo (128k): Can use 20+ chunks for comprehensive answers
- Use the `crop_length` parameter to limit text size (see the sketch below)
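For example, assuming `crop_length` is accepted as a query parameter on `/search` (an assumption here; how it is passed, and its units, should be checked against the API reference), you could trim each hit before it reaches your prompt:

```python
# Sketch: request shorter chunks, assuming crop_length is a /search
# query parameter -- verify against the API reference.
response = requests.get(
    f"{WIKIREST_URL}/search",
    params={"q": "photosynthesis", "limit": 8, "crop_length": 300},
    headers={"X-API-Key": WIKIREST_API_KEY},
)
response.raise_for_status()
short_hits = response.json()["hits"]
```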
### Caching Strategy
- Cache search results for repeated queries (Wikipedia changes infrequently)
- Use the `/v1/changes` endpoint to invalidate stale cache entries
- Consider a 1-hour TTL for most use cases (a minimal cache is sketched below)
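A minimal in-memory TTL cache around the `retrieve_context` function from the basic example might look like this. It is a sketch only: a production setup would more likely use Redis or similar, and poll `/v1/changes` for invalidation rather than relying on expiry alone.

```python
import time

# (query, limit) -> (fetch timestamp, hits)
_cache: dict[tuple[str, int], tuple[float, list[dict]]] = {}
CACHE_TTL = 3600  # 1-hour TTL, per the suggestion above


def retrieve_context_cached(query: str, limit: int = 5) -> list[dict]:
    """retrieve_context with a simple in-memory TTL cache."""
    key = (query, limit)
    now = time.time()
    if key in _cache:
        fetched_at, hits = _cache[key]
        if now - fetched_at < CACHE_TTL:
            return hits  # Fresh enough: skip the network call
    hits = retrieve_context(query, limit)
    _cache[key] = (now, hits)
    return hits
```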
## Full Working Example
Here's a complete RAG chatbot implementation:
"""
WikiRest RAG Chatbot
A complete implementation of a RAG-powered Wikipedia chatbot.
"""
import requests
import openai
from typing import Optional
from dataclasses import dataclass
@dataclass
class WikiChunk:
id: str
title: str
text: str
section: Optional[str]
url: str
class WikiRAG:
def __init__(self, wikirest_key: str, openai_key: str):
self.wikirest_key = wikirest_key
self.openai_client = openai.OpenAI(api_key=openai_key)
self.api_url = "https://api.wikirest.com/v1"
def search(self, query: str, limit: int = 5) -> list[WikiChunk]:
"""Search Wikipedia for relevant passages."""
response = requests.get(
f"{self.api_url}/search",
params={"q": query, "limit": limit},
headers={"X-API-Key": self.wikirest_key}
)
response.raise_for_status()
return [
WikiChunk(
id=hit["id"],
title=hit["title"],
text=hit["text"],
section=hit.get("section"),
url=hit["url"]
)
for hit in response.json()["hits"]
]
def format_context(self, chunks: list[WikiChunk]) -> str:
"""Format chunks for the LLM prompt."""
formatted = []
for chunk in chunks:
header = f"[{chunk.title}"
if chunk.section:
header += f" - {chunk.section}"
header += "]"
formatted.append(f"{header}\n{chunk.text}")
return "\n\n---\n\n".join(formatted)
def format_sources(self, chunks: list[WikiChunk]) -> str:
"""Format source citations."""
sources = []
seen = set()
for chunk in chunks:
if chunk.title not in seen:
sources.append(f"- [{chunk.title}]({chunk.url})")
seen.add(chunk.title)
return "\n".join(sources)
def answer(
self,
question: str,
num_chunks: int = 5,
model: str = "gpt-4"
) -> dict:
"""Generate a RAG answer with sources."""
# Retrieve relevant context
chunks = self.search(question, limit=num_chunks)
if not chunks:
return {
"answer": "I couldn't find relevant information.",
"sources": []
}
# Build prompt
context = self.format_context(chunks)
prompt = f"""You are a helpful assistant that answers questions
using Wikipedia knowledge. Answer based on the provided context.
Be accurate and cite sources using [Article Title] format.
Context:
{context}
Question: {question}
Provide a comprehensive answer with citations:"""
# Generate response
response = self.openai_client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
temperature=0.7,
max_tokens=1000
)
return {
"answer": response.choices[0].message.content,
"sources": self.format_sources(chunks),
"chunks_used": len(chunks)
}
# Example usage
if __name__ == "__main__":
rag = WikiRAG(
wikirest_key="your_key_here",
openai_key="your_openai_key"
)
result = rag.answer("Explain how black holes form")
print("Answer:")
print(result["answer"])
print("\nSources:")
print(result["sources"])