import requests
import pandas as pd
from time import sleep

def use_semantic_scholar_enhanced():
    """增强版:获取包含h-index等完整学术指标的作者数据"""
    
    base_url = "<https://api.semanticscholar.org/graph/v1>"

    # Extended set of LLM-related keywords
    ai_keywords = [
        "large language model", "LLM", "GPT", "BERT", "RLHF",
        "prompt engineering", "LLaMA", "ChatGPT", "mixture of experts"
    ]

    def search_papers_semantic(query, limit=100, offset=0):
        """搜索论文(支持分页)"""
        params = {
            "query": query,
            "limit": limit,
            "offset": offset,
            "fields": "title,authors,citationCount,year,paperId"
        }
        response = requests.get(f"{base_url}/paper/search", params=params)
        return response.json().get("data", []) if response.status_code == 200 else []
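    # Note (assumption): under heavy use the public API can answer HTTP 429
    # (Too Many Requests). A minimal hedge is to pause and retry once, e.g.:
    #
    #   if response.status_code == 429:
    #       sleep(2)
    #       response = requests.get(f"{base_url}/paper/search", params=params)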

    def get_author_details(author_id):
        """获取作者完整指标(与图片结构一致)"""
        url = f"{base_url}/author/{author_id}"
        params = {
            "fields": "name,affiliations,paperCount,citationCount,hIndex,papers.title,papers.year"
        }
        response = requests.get(url, params=params)
        if response.status_code == 200:
            return response.json()
        return None
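    # Optional (assumption): the unauthenticated Semantic Scholar API is rate-limited.
    # If you have an API key, sending it with each request raises the limits; a minimal
    # sketch, with the header name taken from the public API docs (verify before relying on it):
    #
    #   headers = {"x-api-key": "YOUR_API_KEY"}
    #   response = requests.get(url, params=params, headers=headers)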

    all_authors = {}

    # Search across each keyword
    for keyword in ai_keywords:
        print(f"Processing: {keyword}")
        for offset in [0, 100, 200]:  # at most 300 papers per keyword
            papers = search_papers_semantic(keyword, limit=100, offset=offset)
            if not papers:
                break

            for paper in papers:
                for author in paper.get("authors", []):
                    author_id = author.get("authorId")
                    if not author_id:
                        continue

                    # Fetch full details the first time an author is seen
                    if author_id not in all_authors:
                        details = get_author_details(author_id)
                        if not details:
                            continue

                        # Dictionary structure mirrors the one shown in the image
                        all_authors[author_id] = {
                            "name": details.get("name"),
                            "affiliations": details.get("affiliations", []),
                            "paper_count": details.get("paperCount", 0),
                            "citation_count": details.get("citationCount", 0),
                            "h_index": details.get("hIndex", 0),
                            "recent_papers": [
                                {"title": p["title"], "year": p["year"]}
                                for p in details.get("papers", [])[:3]  # first 3 papers returned
                            ]
                        }
                        sleep(0.5)  # brief pause between author-detail requests

    # Convert to a DataFrame and save as CSV
    df = pd.DataFrame(all_authors.values())
    df.to_csv('enhanced_llm_authors.csv', index=False)
    return df

# Example usage
if __name__ == "__main__":
    df = use_semantic_scholar_enhanced()
    print(f"获取到 {len(df)} 位作者,包含以下字段:")
    print(df.columns)
    print("\\n示例数据:")
    print(df.head())