import requests
import pandas as pd
from time import sleep
def use_semantic_scholar_enhanced():
    """Collect author metrics for LLM-related papers from Semantic Scholar.

    For each keyword, up to 300 papers are fetched from the paper search
    endpoint (three pages of 100). Every author seen for the first time is
    looked up on the author endpoint, and the name, affiliations, paper
    count, citation count, h-index and three most recent papers are kept.

    The result is written to ``enhanced_llm_authors.csv`` in the current
    working directory and also returned.

    Failed requests (network error, non-200 status, undecodable body) are
    skipped rather than raised, so the crawl is best-effort.

    Returns:
        pandas.DataFrame: one row per distinct author, with columns
        ``name``, ``affiliations``, ``paper_count``, ``citation_count``,
        ``h_index`` and ``recent_papers``.
    """
    # Plain URL: the literal used to be wrapped in "<...>" (a Markdown
    # autolink artifact), which requests rejects as an invalid URL.
    base_url = "https://api.semanticscholar.org/graph/v1"
    request_timeout = 30  # seconds; without it a stalled connection hangs forever
    request_interval = 0.5  # seconds to pause after every HTTP request

    # Extended list of LLM-related search keywords.
    ai_keywords = [
        "large language model", "LLM", "GPT", "BERT", "RLHF",
        "prompt engineering", "LLaMA", "ChatGPT", "mixture of experts",
    ]

    def fetch_json(url, params):
        """Return the decoded JSON body of a GET request, or None on failure."""
        try:
            response = requests.get(url, params=params, timeout=request_timeout)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        try:
            return response.json()
        except ValueError:
            return None

    def search_papers_semantic(query, limit=100, offset=0):
        """Search papers by keyword (paginated); return a list of paper dicts."""
        params = {
            "query": query,
            "limit": limit,
            "offset": offset,
            "fields": "title,authors,citationCount,year,paperId",
        }
        payload = fetch_json(f"{base_url}/paper/search", params)
        if not isinstance(payload, dict):
            return []
        return payload.get("data") or []

    def get_author_details(author_id):
        """Fetch the full metric record for one author, or None on failure."""
        params = {
            "fields": "name,affiliations,paperCount,citationCount,hIndex,papers.title,papers.year"
        }
        return fetch_json(f"{base_url}/author/{author_id}", params)

    def summarize_recent_papers(papers, count=3):
        """Return title/year dicts for the ``count`` newest papers."""
        # Sort explicitly by year; papers without a year sort last.
        newest_first = sorted(
            papers, key=lambda p: p.get("year") or 0, reverse=True
        )
        return [
            {"title": p.get("title"), "year": p.get("year")}
            for p in newest_first[:count]
        ]

    all_authors = {}

    # Search every keyword in turn.
    for keyword in ai_keywords:
        print(f"Processing: {keyword}")
        for offset in (0, 100, 200):  # at most 300 papers per keyword
            papers = search_papers_semantic(keyword, limit=100, offset=offset)
            sleep(request_interval)
            if not papers:
                break
            for paper in papers:
                for author in paper.get("authors") or []:
                    author_id = author.get("authorId")
                    # Fetch full details only the first time an author is seen.
                    if not author_id or author_id in all_authors:
                        continue
                    details = get_author_details(author_id)
                    sleep(request_interval)
                    if not details:
                        continue
                    all_authors[author_id] = {
                        "name": details.get("name"),
                        "affiliations": details.get("affiliations", []),
                        "paper_count": details.get("paperCount", 0),
                        "citation_count": details.get("citationCount", 0),
                        "h_index": details.get("hIndex", 0),
                        "recent_papers": summarize_recent_papers(
                            details.get("papers") or []
                        ),
                    }

    # Convert to a DataFrame and save it.
    df = pd.DataFrame(list(all_authors.values()))
    df.to_csv('enhanced_llm_authors.csv', index=False)
    return df
# Example run
if __name__ == "__main__":
    df = use_semantic_scholar_enhanced()
    print(f"获取到 {len(df)} 位作者,包含以下字段:")
    print(df.columns)
    # A real newline ("\n"), not the over-escaped "\\n" that printed a
    # literal backslash-n before the heading.
    print("\n示例数据:")
    print(df.head())