使用 SmoLAgents 构建非洲文化数据集:实验性

社区文章 发布于 2025 年 2 月 7 日

image/jpeg

引言

SmoLAgents 提供了一个强大的框架,通过多智能体系统创建丰富的文化数据集。本实现侧重于使用专业的 AI 智能体收集、组织和推理非洲文化知识。

系统架构

!pip install smolagents[litellm] datasets

模型配置

# Model line-up for the pipeline: a general-purpose model, a high-effort
# reasoning model, a coding model routed through OpenRouter, and a robust
# fallback model. (Excerpt from the full class below.)
self.model = LiteLLMModel(model_id="gpt-4o-mini")
self.reasoning_model = LiteLLMModel(model_id="o3-mini", reasoning_effort="high")
self.coder_model = LiteLLMModel(
    model_id="openrouter/anthropic/claude-3.5-sonnet",
    temperature=0.8  # NOTE(review): presumably raised for more varied generations — confirm intent
)
self.robust_model = LiteLLMModel(model_id="o1")

专业代理

研究代理

  • 配备网页搜索和网页访问能力
  • 使用高能力模型进行复杂推理
  • 最多 6 个处理步骤以进行全面研究
  • 访问大量数据处理工具
# Research agent with web-search and page-visit capability.
# BUGFIX: the original passed undefined names `google_search` and
# `visit_webpage`; instantiate the tool classes that smolagents provides.
self.researcher = CodeAgent(
    tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
    model=self.coder_model,
    max_steps=6,           # cap the research loop for cost/latency control
    verbosity_level=3,
    additional_authorized_imports=[
        'math', 'queue', 'stat', 'statistics', 're', 'itertools',
        'unicodedata', 'collections', 'datetime', 'time', 'random',
        'bs4', 'markdownify', 'requests', 'pandas',
    ],
)

async def research_cultural_info(self, category: str, topic: str) -> Dict:
        """Research `topic` within African `category` and return structured findings.

        Runs the web-research agent first, then asks a model to reshape the
        free-form findings into a fixed JSON schema.

        Returns:
            The parsed dict, or {} when the structured output is not valid JSON.
            NOTE(review): only JSONDecodeError is caught here — failures from
            the agent run itself propagate to the caller.
        """
        try:
            research_prompt = f"""
            You are an expert researcher on African History
            Research and provide comprehensive information about {topic} in African {category}.
            Focus on historical context, regional variations, and modern practices.
            """
            # Synchronous agent run gathers the raw research material.
            research_data = self.researcher.run(research_prompt)
            
            structure_prompt = f"""
            Based on this research: {research_data}
            Create a structured JSON with:
            {{
                "overview": "brief description",
                "historical_context": "historical background",
                "regional_variations": ["list of variations by region"],
                "cultural_significance": "detailed significance",
                "modern_practices": "current adaptations",
                "sources": ["list of sources"]
            }}
            """
            structured_data = await self.generate_with_model(structure_prompt)
            return json.loads(structured_data)
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
            return {}

问答生成代理

  • 创建具有文化意识的问题和答案
  • 实施难度级别(基础/中级/高级)
  • 确保区域代表性
  • 保持文化真实性
async def generate_qa_pairs(self, cultural_data: Dict) -> List[Dict]:
        """Generate six Q&A pairs (with difficulty, category, regions) from
        previously researched cultural data.

        Returns:
            The parsed list of Q&A dicts, or [] on any failure (the broad
            except deliberately keeps the pipeline running best-effort).
        """
        try:
            qa_prompt = f"""
            Based on this cultural information:
            {json.dumps(cultural_data, indent=2)}
            
            Generate 6 question-answer pairs in this JSON format:
            [{{
                "question": "detailed question",
                "answer": "comprehensive answer",
                "difficulty": "basic|intermediate|advanced",
                "category": "historical|practical|conceptual",
                "regions": ["relevant African regions"]
            }}]
            """
            qa_response = await self.generate_with_model(qa_prompt)
            return json.loads(qa_response)
        except Exception as e:
            print(f"QA generation error: {e}")
            return []

推理生成代理

  • 生成详细的解决方案链
  • 分解文化概念
  • 提供逐步分析
  • 连接历史和现代背景
async def generate_reasoning(self, qa_pairs: List[Dict]) -> List[Dict]:
        """Produce step-by-step reasoning chains for the given Q&A pairs.

        Returns:
            The parsed list of reasoning dicts, or [] on any failure (the
            broad except deliberately keeps the pipeline running best-effort).
        """
        try:
            reasoning_prompt = f"""
            For these Q&A pairs:
            {json.dumps(qa_pairs, indent=2)}
            
            Generate detailed reasoning chains in this JSON format:
            [{{
                "question": "original question",
                "reasoning_steps": [
                    "step 1: initial understanding",
                    "step 2: cultural context",
                    "step 3: analysis",
                    "step 4: conclusion"
                ],
                "final_answer": "detailed answer",
                "cultural_context": "relevant cultural background",
                "sources": ["reference sources"]
            }}]
            """
            reasoning_data = await self.generate_with_model(reasoning_prompt)
            return json.loads(reasoning_data)
        except Exception as e:
            print(f"Reasoning generation error: {e}")
            return []

数据收集流程

文化研究阶段

{
    "overview": "brief description",
    "historical_context": "historical background",
    "regional_variations": ["variations by region"],
    "cultural_significance": "detailed significance",
    "modern_practices": "current adaptations",
    "sources": ["reference sources"]
}

问答生成阶段

{
    "question": "detailed question",
    "answer": "comprehensive answer",
    "difficulty": "basic|intermediate|advanced",
    "category": "historical|practical|conceptual",
    "regions": ["relevant African regions"]
}

推理链生成

{
    "question": "original question",
    "reasoning_steps": [
        "step 1: initial understanding",
        "step 2: cultural context",
        "step 3: analysis",
        "step 4: conclusion"
    ],
    "final_answer": "detailed answer",
    "cultural_context": "relevant background"
}

完整代码

import asyncio
import json
import os
from datetime import datetime
from typing import Any, Dict, List

from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel, VisitWebpageTool

class AfricanCultureDataGenerator:
    """Multi-agent pipeline that researches African cultural topics and
    produces structured cultural data, Q&A pairs, and reasoning chains.

    Results are accumulated per category/topic and persisted to
    ``african_cultural_dataset.json`` by :meth:`generate_dataset`.
    """

    def __init__(self, api_key: str):
        """Set up models, the web-research agent, and the topic catalogue.

        Args:
            api_key: OpenAI API key; exported to the environment so the
                LiteLLM-backed models can authenticate.

        Raises:
            KeyError: if ``OPENROUTER_API_KEY`` is not set in the
                environment (required by the coder model) — fails fast.
        """
        # Initialize with explicit API key
        os.environ["OPENAI_API_KEY"] = api_key

        self.model = LiteLLMModel(model_id="gpt-4o-mini")
        self.reasoning_model = LiteLLMModel(
            model_id="o3-mini",
            reasoning_effort="high",
        )
        self.coder_model = LiteLLMModel(
            model_id="openrouter/anthropic/claude-3.5-sonnet",
            api_key=os.environ["OPENROUTER_API_KEY"],
            temperature=0.8,
        )
        self.robust_model = LiteLLMModel(model_id="o1")

        # Research agent with web-search and page-visit tools.
        # BUGFIX: the original referenced undefined names `google_search`
        # and `visit_webpage`; instantiate the smolagents tool classes.
        self.researcher = CodeAgent(
            tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
            model=self.coder_model,
            max_steps=6,
            verbosity_level=3,
            additional_authorized_imports=[
                'math', 'queue', 'stat', 'statistics', 're', 'itertools',
                'unicodedata', 'collections', 'datetime', 'time', 'random',
                'bs4', 'markdownify', 'requests', 'pandas',
            ],
        )

        # Category -> list of topics to research.
        # BUGFIX: a missing comma after "initiation rituals" silently
        # concatenated it with "storytelling" into one bogus topic.
        self.categories = {
            "traditions": [
                "marriage ceremonies",
                "naming ceremonies",
                "initiation rituals",
                "storytelling",
                "science",
            ],
            "music": [
                "traditional instruments",
                "musical styles",
                "dance forms",
                "ceremonial music",
            ],
            "social_structures": [
                "family systems",
                "leadership roles",
                "age groups",
                "community organization",
            ],
            "cultural_values": [
                "respect for elders",
                "community solidarity",
                "spiritual beliefs",
                "oral traditions",
            ],
        }

    async def generate(self, prompt: str) -> str:
        """Run a tool-less CodeAgent on *prompt* and return its answer as text.

        A fresh agent is created per call; dict responses are serialized to
        JSON so callers can always ``json.loads`` the result.
        """
        agent = CodeAgent(
            tools=[],
            model=self.model,
            max_steps=6,
            # Duplicate 'time' entry removed from the original list.
            additional_authorized_imports=[
                'bs4', 'stat', 'statistics', 'unicodedata', 'collections',
                'requests', 'time', 'json', 'os', 'random', 'math', 'queue',
                'markdownify', 're', 'itertools', 'datetime', 'pandas',
            ],
        )
        # Get the agent's response.
        response = agent.run(prompt)
        # If the response is a dictionary, convert it to a JSON string.
        if isinstance(response, dict):
            return json.dumps(response)
        # Otherwise, return the response as is.
        return response

    async def generate_with_model(self, prompt: str) -> str:
        """Like :meth:`generate`, but never raises.

        Returns "{}" on any failure or empty response so downstream
        ``json.loads`` calls stay safe (best-effort pipeline).
        """
        try:
            response = await self.generate(prompt)
            return response if response else "{}"
        except Exception as e:
            print(f"Model generation error: {e}")
            return "{}"

    async def research_cultural_info(self, category: str, topic: str) -> Dict:
        """Research *topic* within African *category* and return structured findings.

        Runs the web-research agent, then asks a model to reshape the raw
        findings into a fixed JSON schema.

        Returns:
            The parsed dict, or {} when the structured output is not valid
            JSON. NOTE: exceptions other than JSONDecodeError (e.g. from the
            agent run) propagate to the caller.
        """
        try:
            research_prompt = f"""
            You are an expert researcher on African History
            Research and provide comprehensive information about {topic} in African {category}.
            Focus on historical context, regional variations, and modern practices.
            """
            research_data = self.researcher.run(research_prompt)

            structure_prompt = f"""
            Based on this research: {research_data}
            Create a structured JSON with:
            {{
                "overview": "brief description",
                "historical_context": "historical background",
                "regional_variations": ["list of variations by region"],
                "cultural_significance": "detailed significance",
                "modern_practices": "current adaptations",
                "sources": ["list of sources"]
            }}
            """
            structured_data = await self.generate_with_model(structure_prompt)
            return json.loads(structured_data)
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
            return {}

    async def generate_qa_pairs(self, cultural_data: Dict) -> List[Dict]:
        """Generate six Q&A pairs (difficulty, category, regions) from
        researched cultural data; returns [] on any failure."""
        try:
            qa_prompt = f"""
            Based on this cultural information:
            {json.dumps(cultural_data, indent=2)}
            
            Generate 6 question-answer pairs in this JSON format:
            [{{
                "question": "detailed question",
                "answer": "comprehensive answer",
                "difficulty": "basic|intermediate|advanced",
                "category": "historical|practical|conceptual",
                "regions": ["relevant African regions"]
            }}]
            """
            qa_response = await self.generate_with_model(qa_prompt)
            return json.loads(qa_response)
        except Exception as e:
            print(f"QA generation error: {e}")
            return []

    async def generate_reasoning(self, qa_pairs: List[Dict]) -> List[Dict]:
        """Produce step-by-step reasoning chains for the given Q&A pairs;
        returns [] on any failure."""
        try:
            reasoning_prompt = f"""
            For these Q&A pairs:
            {json.dumps(qa_pairs, indent=2)}
            
            Generate detailed reasoning chains in this JSON format:
            [{{
                "question": "original question",
                "reasoning_steps": [
                    "step 1: initial understanding",
                    "step 2: cultural context",
                    "step 3: analysis",
                    "step 4: conclusion"
                ],
                "final_answer": "detailed answer",
                "cultural_context": "relevant cultural background",
                "sources": ["reference sources"]
            }}]
            """
            reasoning_data = await self.generate_with_model(reasoning_prompt)
            return json.loads(reasoning_data)
        except Exception as e:
            print(f"Reasoning generation error: {e}")
            return []

    async def process_category(self, category: str, topic: str) -> Dict:
        """Run the full research -> Q&A -> reasoning chain for one topic.

        Returns a result dict with generation metadata, or {"error": ...}
        when any stage raises (keeps the overall run going).
        """
        try:
            cultural_data = await self.research_cultural_info(category, topic)
            qa_pairs = await self.generate_qa_pairs(cultural_data)
            reasoning_data = await self.generate_reasoning(qa_pairs)

            return {
                "category": category,
                "topic": topic,
                "cultural_data": cultural_data,
                "qa_pairs": qa_pairs,
                "reasoning_data": reasoning_data,
                "metadata": {
                    "generated_at": datetime.now().isoformat(),
                    "model": "gpt-family/o3",
                    "version": "1.0",
                },
            }
        except Exception as e:
            print(f"Error processing {category}/{topic}: {e}")
            return {"error": str(e)}

    async def generate_dataset(self):
        """Process every category/topic, persist the nested result dict to
        african_cultural_dataset.json, and return it."""
        dataset = {}
        for category, topics in self.categories.items():
            dataset[category] = {}
            for topic in topics:
                print(f"Processing {category}/{topic}...")
                dataset[category][topic] = await self.process_category(category, topic)
                # Brief pause between topics to avoid hammering the APIs.
                await asyncio.sleep(2)

        with open("african_cultural_dataset.json", "w", encoding="utf-8") as f:
            json.dump(dataset, f, indent=2, ensure_ascii=False)

        return dataset

async def main():
    """Entry point: build the generator from the environment and run the pipeline.

    Raises:
        KeyError: if OPENAI_API_KEY is not set — fails fast before any work.
    """
    api_key = os.environ["OPENAI_API_KEY"]
    generator = AfricanCultureDataGenerator(api_key)
    # The dataset is persisted to disk by generate_dataset(); the return
    # value is not needed here.
    await generator.generate_dataset()
    print("Dataset generation complete!")

if __name__ == "__main__":
    # BUGFIX: `await main()` is a SyntaxError at module top level —
    # `await` is only valid inside an async function. Drive the coroutine
    # with asyncio.run() instead.
    asyncio.run(main())

结论

本实现展示了专业 AI 代理在创建丰富的非洲文化数据集方面的强大功能,利用多代理架构进行研究、问答生成和推理链。虽然目前的实现前景广阔,但过渡到带有编排器的 E2B 代码执行器将带来多重优势:

  1. 更好的执行控制和资源管理
  2. 改进的错误处理和 API 密钥管理
  3. 并行处理文化数据收集
  4. 可扩展的基础设施,适用于更大的数据集
  5. 增强的监控和验证能力

下一阶段应侧重于:

  • 实施编排器以管理代理工作流
  • 利用 E2B 的代码执行环境进行可靠处理
  • 增加强大的验证机制以确保文化准确性
  • 在不同地区实施并行数据收集
  • 通过分布式处理增强推理链生成

这种演变将保持当前系统的文化真实性,同时增加企业级的可靠性和可扩展性。

资源

社区

恭喜老哥
你真是太棒了

注册登录 评论