使用 smolagents 构建非洲文化数据集:实验性
社区文章 发布于 2025 年 2 月 7 日

引言
smolagents 提供了一个强大的框架,通过多智能体系统创建丰富的文化数据集。本实现侧重于使用专业的 AI 智能体收集、组织和推理非洲文化知识。
系统架构
!pip install smolagents[litellm] datasets
模型配置
# Model configuration: one LiteLLM-backed model per agent role.
# General-purpose model used for structuring/generation tasks.
self.model = LiteLLMModel(model_id="gpt-4o-mini")
# High-effort reasoning model for chain-of-thought style work.
self.reasoning_model = LiteLLMModel(model_id="o3-mini", reasoning_effort="high")
# Code-oriented model routed through OpenRouter; higher temperature
# for more varied generations.
self.coder_model = LiteLLMModel(
model_id="openrouter/anthropic/claude-3.5-sonnet",
temperature=0.8
)
# NOTE(review): "robust" fallback model — presumably for hard prompts;
# it is not invoked elsewhere in this snippet.
self.robust_model = LiteLLMModel(model_id="o1")
专业代理
研究代理
- 配备网页搜索和网页访问能力
- 使用高能力模型进行复杂推理
- 最多 6 个处理步骤以进行全面研究
- 访问大量数据处理工具
# Research agent: a CodeAgent wired with web-search and page-visit tools,
# capped at 6 reasoning steps, with a broad allow-list of imports for
# data wrangling inside the agent's generated code.
# NOTE(review): `google_search` and `visit_webpage` are not defined in this
# snippet — presumably tool instances created elsewhere; confirm before running.
self.researcher = CodeAgent(
tools=[google_search, visit_webpage],
model=self.coder_model,
max_steps=6,
verbosity_level=3,
additional_authorized_imports=['math', 'queue', 'stat', 'statistics', 're', 'itertools', 'unicodedata', 'collections', 'datetime', 'time', 'random', 'bs4', 'markdownify', 'requests', 'pandas']
)
async def research_cultural_info(self, category: str, topic: str) -> Dict:
    """Research *topic* within African *category* and return structured findings.

    Stage 1 runs the research agent for free-form findings; stage 2 asks a
    model to reshape them into a fixed JSON schema.  Returns an empty dict
    when the structured output is not valid JSON.
    """
    try:
        # Stage 1: open-ended web research.
        findings = self.researcher.run(f"""
You are an expert researcher on African History
Research and provide comprehensive information about {topic} in African {category}.
Focus on historical context, regional variations, and modern practices.
""")
        # Stage 2: coerce the findings into the target schema.
        structured = await self.generate_with_model(f"""
Based on this research: {findings}
Create a structured JSON with:
{{
"overview": "brief description",
"historical_context": "historical background",
"regional_variations": ["list of variations by region"],
"cultural_significance": "detailed significance",
"modern_practices": "current adaptations",
"sources": ["list of sources"]
}}
""")
        return json.loads(structured)
    except json.JSONDecodeError as err:
        print(f"JSON parsing error: {err}")
        return {}
问答生成代理
- 创建具有文化意识的问题和答案
- 实施难度级别(基础/中级/高级)
- 确保区域代表性
- 保持文化真实性
async def generate_qa_pairs(self, cultural_data: Dict) -> List[Dict]:
    """Produce six Q&A pairs for the given cultural data.

    Degrades to an empty list on any failure (model call or JSON parse).
    """
    try:
        raw = await self.generate_with_model(f"""
Based on this cultural information:
{json.dumps(cultural_data, indent=2)}
Generate 6 question-answer pairs in this JSON format:
[{{
"question": "detailed question",
"answer": "comprehensive answer",
"difficulty": "basic|intermediate|advanced",
"category": "historical|practical|conceptual",
"regions": ["relevant African regions"]
}}]
""")
        return json.loads(raw)
    except Exception as err:
        print(f"QA generation error: {err}")
        return []
推理生成代理
- 生成详细的解决方案链
- 分解文化概念
- 提供逐步分析
- 连接历史和现代背景
async def generate_reasoning(self, qa_pairs: List[Dict]) -> List[Dict]:
    """Produce step-by-step reasoning chains for previously generated Q&A pairs.

    Degrades to an empty list on any failure (model call or JSON parse).
    """
    try:
        raw = await self.generate_with_model(f"""
For these Q&A pairs:
{json.dumps(qa_pairs, indent=2)}
Generate detailed reasoning chains in this JSON format:
[{{
"question": "original question",
"reasoning_steps": [
"step 1: initial understanding",
"step 2: cultural context",
"step 3: analysis",
"step 4: conclusion"
],
"final_answer": "detailed answer",
"cultural_context": "relevant cultural background",
"sources": ["reference sources"]
}}]
""")
        return json.loads(raw)
    except Exception as err:
        print(f"Reasoning generation error: {err}")
        return []
数据收集流程
文化研究阶段
{
"overview": "brief description",
"historical_context": "historical background",
"regional_variations": ["variations by region"],
"cultural_significance": "detailed significance",
"modern_practices": "current adaptations",
"sources": ["reference sources"]
}
问答生成阶段
{
"question": "detailed question",
"answer": "comprehensive answer",
"difficulty": "basic|intermediate|advanced",
"category": "historical|practical|conceptual",
"regions": ["relevant African regions"]
}
推理链生成
{
"question": "original question",
"reasoning_steps": [
"step 1: initial understanding",
"step 2: cultural context",
"step 3: analysis",
"step 4: conclusion"
],
"final_answer": "detailed answer",
"cultural_context": "relevant background"
}
完整代码
import asyncio
import json
import os
from datetime import datetime
from typing import Any, Dict, List

from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel, VisitWebpageTool
class AfricanCultureDataGenerator:
    """Multi-agent pipeline that builds an African-culture dataset.

    For each category/topic pair the pipeline runs three stages:
    web research (a tool-equipped CodeAgent), Q&A generation, and
    reasoning-chain generation.  Results are accumulated into a nested
    dict and written to ``african_cultural_dataset.json``.
    """

    def __init__(self, api_key: str):
        """Configure models, the research agent, and the topic catalogue.

        Args:
            api_key: OpenAI API key; exported to the environment so LiteLLM
                can pick it up.

        Raises:
            KeyError: if OPENROUTER_API_KEY is not set in the environment.
        """
        os.environ["OPENAI_API_KEY"] = api_key
        # General-purpose model used by the structuring/generation agent.
        self.model = LiteLLMModel(
            model_id="gpt-4o-mini",
        )
        # High-effort reasoning model (kept for experimentation; not
        # invoked directly below).
        self.reasoning_model = LiteLLMModel(
            model_id="o3-mini",
            reasoning_effort="high",
        )
        # Code-oriented model routed through OpenRouter; requires
        # OPENROUTER_API_KEY to be present in the environment.
        self.coder_model = LiteLLMModel(
            model_id="openrouter/anthropic/claude-3.5-sonnet",
            api_key=os.environ["OPENROUTER_API_KEY"],
            temperature=0.8,
        )
        self.robust_model = LiteLLMModel(
            model_id="o1",
        )
        # Research Agent.  Fix: the original referenced undefined
        # `google_search` / `visit_webpage` names (NameError at init);
        # instantiate the smolagents tool classes instead.
        self.researcher = CodeAgent(
            tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
            model=self.coder_model,
            max_steps=6,
            verbosity_level=3,
            additional_authorized_imports=[
                'math', 'queue', 'stat', 'statistics', 're', 'itertools',
                'unicodedata', 'collections', 'datetime', 'time', 'random',
                'bs4', 'markdownify', 'requests', 'pandas',
            ],
        )
        # Category -> topics to process.  Fix: a missing comma after
        # "initiation rituals" previously fused it with "storytelling"
        # into the single topic "initiation ritualsstorytelling" via
        # implicit string concatenation.
        self.categories = {
            "traditions": [
                "marriage ceremonies",
                "naming ceremonies",
                "initiation rituals",
                "storytelling",
                "science",
            ],
            "music": [
                "traditional instruments",
                "musical styles",
                "dance forms",
                "ceremonial music",
            ],
            "social_structures": [
                "family systems",
                "leadership roles",
                "age groups",
                "community organization",
            ],
            "cultural_values": [
                "respect for elders",
                "community solidarity",
                "spiritual beliefs",
                "oral traditions",
            ],
        }

    async def generate(self, prompt: str) -> str:
        """Run a tool-less CodeAgent on *prompt* and return its answer as text."""
        agent = CodeAgent(
            tools=[],
            model=self.model,
            max_steps=6,
            # Fix: removed the duplicated 'time' entry from the allow-list.
            additional_authorized_imports=[
                'bs4', 'stat', 'statistics', 'unicodedata', 'collections',
                'requests', 'time', 'json', 'os', 'random', 'math', 'queue',
                'markdownify', 're', 'itertools', 'datetime', 'pandas',
            ],
        )
        # Get the agent's response.
        response = agent.run(prompt)
        # Agents may return structured data; serialize dicts so callers
        # can always json.loads() the result.
        if isinstance(response, dict):
            return json.dumps(response)
        # Otherwise, return the response as is.
        return response

    async def generate_with_model(self, prompt: str) -> str:
        """Wrapper around generate() that degrades to "{}" on any failure."""
        try:
            response = await self.generate(prompt)
            return response if response else "{}"
        except Exception as e:
            print(f"Model generation error: {e}")
            return "{}"

    async def research_cultural_info(self, category: str, topic: str) -> Dict:
        """Research one topic and return structured findings ({} on bad JSON)."""
        try:
            research_prompt = f"""
You are an expert researcher on African History
Research and provide comprehensive information about {topic} in African {category}.
Focus on historical context, regional variations, and modern practices.
"""
            research_data = self.researcher.run(research_prompt)
            structure_prompt = f"""
Based on this research: {research_data}
Create a structured JSON with:
{{
"overview": "brief description",
"historical_context": "historical background",
"regional_variations": ["list of variations by region"],
"cultural_significance": "detailed significance",
"modern_practices": "current adaptations",
"sources": ["list of sources"]
}}
"""
            structured_data = await self.generate_with_model(structure_prompt)
            return json.loads(structured_data)
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
            return {}

    async def generate_qa_pairs(self, cultural_data: Dict) -> List[Dict]:
        """Generate six Q&A pairs from cultural data ([] on any failure)."""
        try:
            qa_prompt = f"""
Based on this cultural information:
{json.dumps(cultural_data, indent=2)}
Generate 6 question-answer pairs in this JSON format:
[{{
"question": "detailed question",
"answer": "comprehensive answer",
"difficulty": "basic|intermediate|advanced",
"category": "historical|practical|conceptual",
"regions": ["relevant African regions"]
}}]
"""
            qa_response = await self.generate_with_model(qa_prompt)
            return json.loads(qa_response)
        except Exception as e:
            print(f"QA generation error: {e}")
            return []

    async def generate_reasoning(self, qa_pairs: List[Dict]) -> List[Dict]:
        """Generate step-by-step reasoning chains for Q&A pairs ([] on failure)."""
        try:
            reasoning_prompt = f"""
For these Q&A pairs:
{json.dumps(qa_pairs, indent=2)}
Generate detailed reasoning chains in this JSON format:
[{{
"question": "original question",
"reasoning_steps": [
"step 1: initial understanding",
"step 2: cultural context",
"step 3: analysis",
"step 4: conclusion"
],
"final_answer": "detailed answer",
"cultural_context": "relevant cultural background",
"sources": ["reference sources"]
}}]
"""
            reasoning_data = await self.generate_with_model(reasoning_prompt)
            return json.loads(reasoning_data)
        except Exception as e:
            print(f"Reasoning generation error: {e}")
            return []

    async def process_category(self, category: str, topic: str) -> Dict:
        """Run research -> Q&A -> reasoning for one topic; never raises."""
        try:
            cultural_data = await self.research_cultural_info(category, topic)
            qa_pairs = await self.generate_qa_pairs(cultural_data)
            reasoning_data = await self.generate_reasoning(qa_pairs)
            return {
                "category": category,
                "topic": topic,
                "cultural_data": cultural_data,
                "qa_pairs": qa_pairs,
                "reasoning_data": reasoning_data,
                "metadata": {
                    "generated_at": datetime.now().isoformat(),
                    "model": "gpt-family/o3",
                    "version": "1.0",
                },
            }
        except Exception as e:
            print(f"Error processing {category}/{topic}: {e}")
            return {"error": str(e)}

    async def generate_dataset(self):
        """Process every category/topic pair and persist the dataset to JSON.

        Returns the nested dataset dict after writing it to
        ``african_cultural_dataset.json``.
        """
        dataset = {}
        for category, topics in self.categories.items():
            dataset[category] = {}
            for topic in topics:
                print(f"Processing {category}/{topic}...")
                dataset[category][topic] = await self.process_category(category, topic)
                # Crude rate limiting between topics.
                await asyncio.sleep(2)
        with open("african_cultural_dataset.json", "w", encoding="utf-8") as f:
            json.dump(dataset, f, indent=2, ensure_ascii=False)
        return dataset
async def main():
    """Entry point: build the generator from the environment and run the pipeline.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set.
    """
    # Fail fast with a clear message instead of a bare KeyError.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")
    generator = AfricanCultureDataGenerator(api_key)
    dataset = await generator.generate_dataset()
    print("Dataset generation complete!")
if __name__ == "__main__":
    # Fix: `await` is a SyntaxError at module level; drive the coroutine
    # with asyncio.run instead.
    asyncio.run(main())
结论
本实现展示了专业 AI 代理在创建丰富的非洲文化数据集方面的强大功能,利用多代理架构进行研究、问答生成和推理链。虽然目前的实现前景广阔,但过渡到带有编排器的 E2B 代码执行器将带来多重优势:
- 更好的执行控制和资源管理
- 改进的错误处理和 API 密钥管理
- 并行处理文化数据收集
- 可扩展的基础设施,适用于更大的数据集
- 增强的监控和验证能力
下一阶段应侧重于:
- 实施编排器以管理代理工作流
- 利用 E2B 的代码执行环境进行可靠处理
- 增加强大的验证机制以确保文化准确性
- 在不同地区实施并行数据收集
- 通过分布式处理增强推理链生成
这种演变将保持当前系统的文化真实性,同时增加企业级的可靠性和可扩展性。