Tutorial: Web RAG (E09)
This tutorial corresponds to the example file `examples/E09_web_rag_example.py`.
It demonstrates how to use the RAG pipeline to ingest data directly from the web. It shows how to:
- Configure the `WebPageLoader` plugin (aliased as `web_page`).
- Use `genie.rag.index_web_page()` to fetch content from a URL, extract its main text, and index it into a vector store.
- Perform a semantic search over the ingested web content.
Example Code
examples/E09_web_rag_example.py
""" Example: RAG with WebPageLoader using Genie Facade
This example demonstrates indexing content from a web page and performing a similarity search using the Genie facade and FeatureSettings.
To Run:
1. Ensure Genie Tooling is installed (`poetry install --all-extras`).
   You'll need the dependencies for web loading (beautifulsoup4, trafilatura)
   and for local RAG (sentence-transformers, faiss-cpu).
2. Run from the root of the project:
poetry run python examples/E09_web_rag_example.py
"""
# Standard library
import asyncio
import logging
from typing import Optional

# Genie Tooling (project) imports -- one statement per line; fusing them onto
# a single line is a SyntaxError.
from genie_tooling.config.features import FeatureSettings
from genie_tooling.config.models import MiddlewareConfig
from genie_tooling.genie import Genie
async def run_web_rag_demo():
    """Index a single web page through Genie's RAG facade, then search it.

    Builds a ``MiddlewareConfig`` that selects the ``web_page`` loader, the
    ``sentence_transformer`` embedder and the ``faiss`` vector store, creates
    a ``Genie`` instance from it, indexes one URL into a named collection and
    prints up to two matches for a sample query.

    Any exception raised along the way is printed and logged instead of being
    propagated, and the ``Genie`` instance (if one was created) is closed in
    the ``finally`` clause.
    """
    print("--- Web Page RAG Demo ---")

    # Page to ingest (Python's "Getting Started" page) and the target collection.
    page_url = "https://www.python.org/about/gettingstarted/"
    collection = "python_docs_web_collection_e09"

    feature_settings = FeatureSettings(
        llm="none",  # this demo only indexes and searches, so no LLM is configured
        command_processor="none",
        rag_loader="web_page",  # make WebPageLoader the default loader
        rag_embedder="sentence_transformer",
        rag_vector_store="faiss",  # in-memory FAISS for this demo
    )
    # Optional loader tuning, keyed by the loader's canonical plugin ID.
    loader_overrides = {
        "web_page_loader_v1": {
            "use_trafilatura": True,  # try trafilatura for better content extraction
        },
    }
    config = MiddlewareConfig(
        features=feature_settings,
        document_loader_configurations=loader_overrides,
    )

    genie: Optional[Genie] = None
    try:
        print("\nInitializing Genie for Web RAG...")
        genie = await Genie.create(config=config)
        print("Genie initialized!")

        # Step 1: fetch the page and index it.
        print(f"\nIndexing web page: {page_url} into collection '{collection}'...")
        outcome = await genie.rag.index_web_page(page_url, collection_name=collection)
        print(f"Indexing result: {outcome}")
        if outcome.get("status") != "success":
            print(f"ERROR: Indexing failed: {outcome.get('message')}")
            return

        # Step 2: query the freshly indexed collection.
        query = "What are Python libraries?"
        print(f"\nPerforming search for: '{query}' in '{collection}'")
        hits = await genie.rag.search(query, collection_name=collection, top_k=2)
        if not hits:
            print("No search results found.")
            return

        print("\nSearch Results:")
        for rank, hit in enumerate(hits, start=1):
            print(f" --- Result {rank} (Score: {hit.score:.4f}) ---")
            print(f" Content: {hit.content[:300]}...")  # first 300 characters only
            print(f" Source: {hit.metadata.get('url')}")
            print(" ------------------------------------")
    except Exception as exc:
        # Top-level demo boundary: report the failure and keep the traceback in the log.
        print(f"\nAn error occurred: {exc}")
        logging.exception("Web RAG demo error details:")
    finally:
        # Runs on the early returns above as well, so Genie is always closed once created.
        if genie:
            await genie.close()
            print("\nGenie torn down.")
# Script entry point: run the demo only when this file is executed directly.
# The guard must compare the dunder names; a bare `name == "main"` raises
# NameError and would never match anyway.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Uncomment for detailed logs:
    # logging.getLogger("genie_tooling").setLevel(logging.DEBUG)
    asyncio.run(run_web_rag_demo())