LangGraph Research Eval
LangChain Agent — Dataset-Driven Research Evaluation Example
"""
LangChain Agent — Dataset-Driven Research Evaluation Example
Demonstrates a dataset-driven evaluation scenario with:
- LangGraph prebuilt ReAct agent (`create_react_agent`) with multiple tools
- Dataset with structured inputs: { city, data }, messages, expected_output
- System instructions built from dataset inputs (city + data)
- User prompt extracted from dataset messages
- OpenResponses output with input: [system, user] messages
- Multiple evaluators: correctness, tool-usage, quality rubric,
completeness, and city-relevance
- Path-based organization for the Orq dashboard
- Parallel processing
Prerequisites:
- Set OPENAI_API_KEY and ORQ_API_KEY environment variables
- Upload a dataset to Orq with columns:
"city" — city name (string)
"data" — contextual data about the city (string)
"messages" — conversation messages (the user prompt as a message)
"expected_output" — the expected answer (string, optional)
Usage:
ORQ_API_KEY=... OPENAI_API_KEY=... DATASET_ID=... python examples/lib/integrations/langchain/langgraph_research_eval.py
"""
import asyncio
import os
import re
from typing import Any
from urllib.parse import quote_plus
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
# NOTE: this import path is valid for langgraph < 2.0. `create_react_agent` was deprecated
# in langgraph v1.0 in favor of `langchain.agents.create_agent` and is removed in v2.0 —
# update this import when bumping to langgraph 2.x.
from langgraph.prebuilt import create_react_agent
from evaluatorq import DataPoint, ScorerParameter, evaluatorq
from evaluatorq.integrations.langchain_integration import wrap_langgraph_agent
# ────────────────────────────────────────────────
# Helpers — extract text and tool calls from OpenResponses output
# ────────────────────────────────────────────────
def extract_text(output: Any) -> str:
    """Return the text of the first message in an OpenResponses-style output.

    The function looks at ``output["output"]``, picks the first item whose
    ``type`` is ``"message"``, and within that message's ``content`` picks the
    first part whose ``type`` is ``"output_text"``.

    Args:
        output: The job output. Anything that is not a dict yields ``""``.

    Returns:
        The ``text`` of the selected part, or ``""`` when the output has no
        message item, the message has no ``output_text`` part, or the text is
        missing.
    """
    if not isinstance(output, dict):
        return ""

    # `or []` covers an explicit `"output": None`, which `.get(key, [])` would
    # pass through and then fail to iterate.
    items = output.get("output") or []
    if not isinstance(items, (list, tuple)):
        return ""

    # Skip malformed (non-dict) entries instead of crashing on `.get`.
    message = next(
        (item for item in items if isinstance(item, dict) and item.get("type") == "message"),
        None,
    )
    if not message:
        return ""

    parts = message.get("content") or []
    if not isinstance(parts, (list, tuple)):
        return ""

    text_part = next(
        (part for part in parts if isinstance(part, dict) and part.get("type") == "output_text"),
        None,
    )
    if not text_part:
        return ""

    # Normalise a missing or null "text" to "" so callers can rely on `str`.
    return text_part.get("text") or ""
def extract_tool_calls(output: Any) -> list[dict[str, Any]]:
    """Return every ``function_call`` item from an OpenResponses-style output.

    Args:
        output: The job output. Anything that is not a dict yields ``[]``.

    Returns:
        The items of ``output["output"]`` whose ``type`` is
        ``"function_call"``, in their original order. Missing, null, or
        malformed (non-dict) entries are ignored rather than raising.
    """
    if not isinstance(output, dict):
        return []

    # `or []` covers an explicit `"output": None`.
    items = output.get("output") or []
    if not isinstance(items, (list, tuple)):
        return []

    return [
        item
        for item in items
        if isinstance(item, dict) and item.get("type") == "function_call"
    ]
# ────────────────────────────────────────────────
# Build system instructions from dataset inputs
# ────────────────────────────────────────────────
def build_system_instructions(city: str, data: str) -> str:
    """Compose the system prompt for one dataset row.

    Args:
        city: City name interpolated into the analyst role line.
        data: Context data interpolated verbatim after the context header.

    Returns:
        Four paragraphs (role, context data, grounding rule, tool-usage rule)
        separated by blank lines.
    """
    role_line = f"You are an expert analyst for the city of {city}."
    context_block = f"Use the following context data to inform your answers:\n{data}"
    grounding_rule = "Always ground your response in the provided data."
    tool_rule = (
        "You MUST use your tools (search, calculator, or fact_check) at least once before answering. "
        "Search for additional information to supplement the provided data, "
        "verify claims with the fact-checker, or use the calculator for any numerical analysis."
    )
    paragraphs = (role_line, context_block, grounding_rule, tool_rule)
    return "\n\n".join(paragraphs)
# ────────────────────────────────────────────────
# Tools
# ────────────────────────────────────────────────
@tool
def search(query: str) -> dict[str, Any]:
    """Search the web for information on a topic."""
    # Demo stub: no network request is made. Two canned hits are produced by
    # interpolating the query into fixed titles, snippets, and example.com URLs.
    encoded_query = quote_plus(query)

    top_hit = {
        "title": f"Top result for: {query}",
        "snippet": (
            f"Comprehensive information about {query}. According to recent studies, "
            "this topic has significant implications in multiple domains."
        ),
        "url": f"https://example.com/search?q={encoded_query}",
    }
    paper_hit = {
        "title": f"Academic paper: {query}",
        "snippet": (
            f"A peer-reviewed analysis of {query} published in 2024 found that the "
            "key factors include scalability, reliability, and cost-effectiveness."
        ),
        "url": f"https://example.com/papers/{encoded_query}",
    }
    return {"results": [top_hit, paper_hit]}
@tool
def calculator(expression: str) -> dict[str, Any]:
    """Evaluate a mathematical expression."""
    # NOTE: Demo only — answers come from a fixed lookup table, nothing is
    # actually computed. In production, use a dedicated math expression
    # library instead.
    demo_answers: dict[str, float] = {
        "2 + 2": 4,
        "10 * 5": 50,
        "100 / 4": 25,
        "3.14 * 2": 6.28,
        "2 ** 10": 1024,
        "(5 + 3) * 2": 16,
        "1000 - 750": 250,
    }

    # Surrounding whitespace is ignored for the lookup, but the caller's
    # original expression is echoed back unchanged.
    try:
        answer = demo_answers[expression.strip()]
    except KeyError:
        return {
            "expression": expression,
            "result": None,
            "error": "Expression not in demo lookup table",
        }
    return {"expression": expression, "result": answer, "error": None}
@tool
def fact_check(claim: str) -> dict[str, Any]:
    """Verify a factual claim against known sources."""
    # Demo stub: confidence is a fixed constant, so with the current value the
    # verdict is always "supported".
    confidence = 0.85
    if confidence >= 0.85:
        verdict = "supported"
    else:
        verdict = "partially_supported"

    # Only the first 30 characters of the claim go into the source URL.
    source_url = f"https://example.com/fact-check/{quote_plus(claim[:30])}"
    return {
        "claim": claim,
        "verdict": verdict,
        "confidence": round(confidence, 2),
        "sources": [source_url],
    }
# ────────────────────────────────────────────────
# LangGraph agent — create_react_agent
# ────────────────────────────────────────────────
# Tool set handed to the agent: the three @tool-decorated functions of this module.
tools = [search, calculator, fact_check]
# Chat model used by the agent. temperature=0 is presumably chosen to keep
# evaluation runs as repeatable as possible — confirm if this matters for your setup.
model = ChatOpenAI(model="gpt-4o", temperature=0)
# The agent is constructed once, at import time, and this single instance is
# what main() passes to wrap_langgraph_agent.
agent = create_react_agent(model, tools)
# ────────────────────────────────────────────────
# Evaluators
# ────────────────────────────────────────────────
async def correctness_scorer(params: ScorerParameter) -> dict[str, Any]:
    """Score the response against the row's expected output, if there is one.

    With an expected output, the score is 1 (pass) when the expected string
    occurs in the response text, compared case-insensitively, and 0 otherwise.
    Without one, the response is scored on length only: 1 when the text is
    longer than 20 characters, else 0.5, and no ``pass`` key is returned.
    """
    answer_text = extract_text(params["output"]).lower()
    expected = params["data"].expected_output

    if expected:
        matched = str(expected).lower() in answer_text
        if matched:
            explanation = f'Output contains expected answer "{expected}"'
        else:
            explanation = f'Expected "{expected}" not found in output'
        return {
            "value": 1 if matched else 0,
            "pass": matched,
            "explanation": explanation,
        }

    # No reference answer for this row: fall back to a length heuristic.
    substance_score = 1 if len(answer_text) > 20 else 0.5
    return {
        "value": substance_score,
        "explanation": "No expected output — scored on response substance",
    }
async def tool_usage_scorer(params: ScorerParameter) -> dict[str, Any]:
    """Score how many distinct tools the agent called.

    The score is ``distinct_tools / 2`` capped at 1.0: no tool calls gives 0,
    one distinct tool gives 0.5, two or more give 1.0.

    Returns:
        A dict with the rounded score under ``value`` and an ``explanation``
        listing the total call count and the distinct tool names.
    """
    calls = extract_tool_calls(params["output"])
    # Sorted so the explanation is identical across runs; plain set iteration
    # order for strings varies between processes under hash randomization.
    tool_names = sorted({c.get("name", "") for c in calls})
    score = min(len(tool_names) / 2, 1.0)
    return {
        "value": round(score, 2),
        "explanation": (
            f"Used {len(calls)} tool call(s) across {len(tool_names)} "
            f"distinct tool(s): {', '.join(tool_names) or 'none'}"
        ),
    }
async def quality_rubric_scorer(params: ScorerParameter) -> dict[str, Any]:
    """Rate the response on three heuristic criteria and return a rubric value.

    Criteria:
        completeness: word count / 50, capped at 1.0.
        clarity: 0.95 when the average sentence length is 10-25 words,
            0.5 for any other non-zero average, 0.1 when there is no text.
        structure: 0.9 when the text has a bullet, a numbered item, or a
            blank line; 0.5 otherwise.
    """
    text = extract_text(params["output"])

    word_count = len(text.split())
    # Sentences are the non-blank pieces between runs of ., ! or ?
    sentence_count = sum(
        1 for piece in re.split(r"[.!?]+", text) if piece.strip()
    )

    completeness = min(word_count / 50, 1.0)

    words_per_sentence = word_count / sentence_count if sentence_count else 0
    if words_per_sentence <= 0:
        clarity = 0.1
    elif 10 <= words_per_sentence <= 25:
        clarity = 0.95
    else:
        clarity = 0.5

    if re.search(r"(\n[-•*]|\n\d+\.|\n\n)", text):
        structure = 0.9
    else:
        structure = 0.5

    rubric = {
        "completeness": round(completeness, 2),
        "clarity": round(clarity, 2),
        "structure": round(structure, 2),
    }
    return {
        "value": {"type": "rubric", "value": rubric},
        "explanation": "Multi-criteria quality rubric (completeness, clarity, structure)",
    }
# Refusal phrases: "I can't", "I cannot", "I am unable to" (case-insensitive).
# The leading \b keeps the pattern from firing on words that merely END in "i"
# — e.g. "Helsinki cannot", "Miami can't" — which matters here because the
# responses are about cities. ['’] accepts both the ASCII and the typographic
# apostrophe in "can't".
_REFUSAL_RE = re.compile(r"\bi (can['’]t|cannot|am unable to)", re.IGNORECASE)


async def completeness_scorer(params: ScorerParameter) -> dict[str, Any]:
    """Boolean pass/fail — the response must not be too short or a refusal.

    A response is complete when it has at least 10 whitespace-separated words
    and contains no refusal phrase.

    Returns:
        A dict whose ``value`` and ``pass`` are both the completeness boolean,
        plus an ``explanation`` that distinguishes a refusal from a too-short
        response.
    """
    text = extract_text(params["output"])
    word_count = len(text.split())
    is_refusal = _REFUSAL_RE.search(text) is not None
    is_complete = word_count >= 10 and not is_refusal

    if is_complete:
        explanation = f"Complete response ({word_count} words)"
    elif is_refusal:
        explanation = "Agent refused to answer"
    else:
        explanation = f"Incomplete response (only {word_count} words)"

    return {
        "value": is_complete,
        "pass": is_complete,
        "explanation": explanation,
    }
async def city_relevance_scorer(params: ScorerParameter) -> dict[str, Any]:
    """Check that the response mentions the city from the dataset inputs.

    The match is a case-insensitive substring test of the ``city`` input
    against the response text.

    Returns:
        A dict with ``value`` 1 / ``pass`` True when the city is mentioned,
        otherwise 0 / False. A row with no usable city (missing, null, or
        blank) fails with an explanation saying so.
    """
    text = extract_text(params["output"]).lower()

    raw_city = params["data"].inputs.get("city")
    city = "" if raw_city is None else str(raw_city).strip()

    # Without this guard an empty city would always "match" (the empty string
    # is a substring of every text) and a null city would be searched for as
    # the literal word "none".
    if not city:
        return {
            "value": 0,
            "pass": False,
            "explanation": "No city provided in dataset inputs — relevance cannot be checked",
        }

    mentions_city = city.lower() in text
    return {
        "value": 1 if mentions_city else 0,
        "pass": mentions_city,
        "explanation": (
            f'Response references the target city "{city}"'
            if mentions_city
            else f'Response does not mention "{city}"'
        ),
    }
# ────────────────────────────────────────────────
# Run the evaluation
# ────────────────────────────────────────────────
# Orq dataset to evaluate against; read once at import time, validated in main().
DATASET_ID = os.environ.get("DATASET_ID")


async def main() -> None:
    """Run the research-agent evaluation against the configured Orq dataset.

    Raises:
        ValueError: If the DATASET_ID environment variable is unset or empty.
    """
    if not DATASET_ID:
        raise ValueError("DATASET_ID environment variable is required")

    def instructions_for(data: Any) -> str:
        # Per-row system prompt built from the row's "city" and "data" inputs;
        # missing inputs become empty strings.
        row_inputs = data.inputs
        return build_system_instructions(
            str(row_inputs.get("city", "")),
            str(row_inputs.get("data", "")),
        )

    research_job = wrap_langgraph_agent(
        agent,
        name="langchain-research-agent",
        instructions=instructions_for,
    )

    # (evaluator name, scorer coroutine) pairs, in reporting order.
    scorers = (
        ("correctness", correctness_scorer),
        ("tool-usage", tool_usage_scorer),
        ("quality-rubric", quality_rubric_scorer),
        ("completeness", completeness_scorer),
        ("city-relevance", city_relevance_scorer),
    )

    await evaluatorq(
        "langchain-research-eval",
        description=(
            "LangChain research agent evaluation with structured dataset input "
            "(city + data), custom instructions, and OpenResponses output"
        ),
        path="Integrations/LangChain",
        parallelism=3,
        data={"dataset_id": DATASET_ID, "include_messages": True},
        jobs=[research_job],
        evaluators=[
            {"name": scorer_name, "scorer": scorer_fn}
            for scorer_name, scorer_fn in scorers
        ],
    )


if __name__ == "__main__":
    asyncio.run(main())