文献pdf改名&AI消化
分享两个用阿里云的大模型Qwen-long对文献进行"批量pdf改名"和"批量AI消化"的python代码。
批量pdf改名(年份_杂志_标题.pdf):
import os
import time
from pathlib import Path
from openai import OpenAI
# Batch-rename every PDF in `folder_path` to "year_journal_title.pdf", asking
# Qwen-long (through DashScope's OpenAI-compatible endpoint) for the fields.

# Prefer the DASHSCOPE_API_KEY environment variable so the key does not have to
# live in the script; the placeholder is only a fallback to be edited by hand.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxxxx(你的阿里云API-key)xxxxxxxx")
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# NOTE: a raw string literal cannot end with a backslash (that is a
# SyntaxError), so the folder path is written without a trailing separator.
folder_path = Path(r"D:\x\x你放pdf的文件夹路径\x")

# Upper bound on the generated file stem, to avoid over-long file names.
MAX_STEM_LENGTH = 100

SYSTEM_PROMPT = "你是一位学术文献处理助手,你的任务是从上传的PDF文件中提取论文的发表年份、杂志简写和标题,并按照以下格式返回:`年份_杂志简写_标题`,不要添加任何其他内容或解释。"
USER_PROMPT = "请提取这篇论文的发表年份、杂志简写和标题,并按照`年份_杂志简写_标题`的格式返回,不要添加任何其他内容。"


def build_safe_stem(result, max_length=MAX_STEM_LENGTH):
    """Convert the model reply into a safe ``year_journal_title`` file stem.

    Args:
        result: Raw text returned by the model.
        max_length: Maximum number of characters kept in the returned stem.

    Returns:
        The sanitized stem (without the ``.pdf`` extension).

    Raises:
        ValueError: If the reply is empty, or does not consist of exactly
            three non-empty ``_``-separated fields.
    """
    if not result or not result.strip():
        raise ValueError("Empty result generated")
    # Keep only alphanumerics (any script), spaces and underscores. This drops
    # path separators, backticks, newlines and punctuation from the reply.
    cleaned = "".join(c for c in result if c.isalnum() or c in (" ", "_")).strip()
    fields = cleaned.split("_")
    # Validate after cleaning so a reply such as "2020__Title" (empty journal)
    # is rejected instead of producing a misleading file name.
    if len(fields) != 3 or not all(field.strip() for field in fields):
        raise ValueError("Result format is incorrect, expected `年份_杂志简写_标题`")
    return cleaned[:max_length].strip()


def rename_pdf(client, pdf_file):
    """Upload one PDF, ask the model for its metadata and rename the file.

    Args:
        client: An ``OpenAI`` client configured for the DashScope endpoint.
        pdf_file: Path of the PDF to process.

    Raises:
        ValueError: If the model returns no choices or an unusable reply.
        FileExistsError: If a different file already has the target name.
    """
    uploaded_file = client.files.create(file=pdf_file, purpose="file-extract")
    print(f"Uploaded: {uploaded_file.id}")
    try:
        response = client.chat.completions.create(
            model="qwen-long",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                # The uploaded document is referenced by id in a system message.
                {"role": "system", "content": f"fileid://{uploaded_file.id}"},
                {"role": "user", "content": USER_PROMPT},
            ],
            stream=False,
        )
        if not response.choices:
            raise ValueError("No response choices available")
        safe_result = build_safe_stem(response.choices[0].message.content)

        new_file_path = pdf_file.with_name(f"{safe_result}.pdf")
        # os.rename silently replaces an existing target on POSIX; refuse to
        # clobber another paper that happens to map to the same name.
        if new_file_path.exists() and not new_file_path.samefile(pdf_file):
            raise FileExistsError(f"Target already exists: {new_file_path.name}")
        os.rename(pdf_file, new_file_path)
        print(f"Renamed: {pdf_file.name} -> {new_file_path.name}")
    finally:
        # Always remove the uploaded copy, including when the request or the
        # rename failed, so failed runs do not leave files behind remotely.
        try:
            time.sleep(3)
            deleted_file = client.files.delete(uploaded_file.id)
            print(deleted_file.model_dump_json())
        except Exception as cleanup_error:
            # Report but do not mask the primary error (if any).
            print(f"Cleanup failed for {uploaded_file.id}: {cleanup_error}")


def rename_all_pdfs():
    """Rename every ``*.pdf`` in ``folder_path``; log failures and continue."""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    # Snapshot the listing first: files are renamed inside the loop, and
    # changing a directory while lazily iterating it may revisit or skip entries.
    for pdf_file in sorted(folder_path.glob("*.pdf")):
        print(f"Processing: {pdf_file.name}")
        try:
            rename_pdf(client, pdf_file)
        except Exception as e:
            print(f"Error processing {pdf_file.name}: {str(e)}")
            # Explicit UTF-8 so non-ASCII file names can always be logged.
            with open(folder_path / "error_log.txt", "a", encoding="utf-8") as f:
                f.write(f"{time.ctime()} | {pdf_file.name} | {str(e)}\n")
    print("处理完成!")


if __name__ == "__main__":
    rename_all_pdfs()
批量AI消化(结果保存到md文件):
import os
import time
from pathlib import Path
from openai import OpenAI
# Batch "AI digest": summarize every PDF in `folder_path` with Qwen-long
# (through DashScope's OpenAI-compatible endpoint) and save each summary as a
# Markdown file next to the PDF.

# Prefer the DASHSCOPE_API_KEY environment variable so the key does not have to
# live in the script; the placeholder is only a fallback to be edited by hand.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxxxx(你的阿里云API-key)xxxxxxxx")
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# NOTE: a raw string literal cannot end with a backslash (that is a
# SyntaxError), so the folder path is written without a trailing separator.
folder_path = Path(r"D:\x\x你放pdf的文件夹路径\x")

# Upper bound on the part of the PDF stem reused in the Markdown file name.
MAX_STEM_LENGTH = 50

SYSTEM_PROMPT = "你是一位生物医学领域的文献总结大师,按照用户提出的格式总结上传的学术论文,用markdown输出内容。"
# Template the model is asked to fill in; edit the last item to match the
# topic whose citations you want extracted.
USER_PROMPT = (
    "文章标题: [请填写文章标题]\n\n"
    "这篇论文试图解决什么问题?: [简要描述论文试图解决的核心问题或挑战]\n\n"
    "主要研究内容和结论: [概括论文的主要研究内容和得出的关键结论]\n\n"
    "论文的贡献: [用简洁的语言总结论文的主要贡献]\n\n"
    "研究意义: [说明这项研究对学术界或实际应用的意义]\n\n"
    "未来的研究方向: [列出作者提出的未来研究方向或建议]\n\n"
    "相关文献引用: [列出论文中引用的与RTCB或RNA ligation相关的文献]"
)


def build_markdown_path(pdf_file, max_length=MAX_STEM_LENGTH):
    """Return the Markdown output path for *pdf_file*.

    Args:
        pdf_file: Path of the source PDF.
        max_length: Maximum number of characters of the PDF stem to keep.

    Returns:
        A path in the same directory as the PDF, named
        ``<stem with spaces replaced by "_", truncated>_AI消化.md``.
    """
    safe_name = pdf_file.stem.replace(" ", "_")[:max_length]
    return pdf_file.with_name(f"{safe_name}_AI消化.md")


def digest_pdf(client, pdf_file):
    """Upload one PDF, request a summary and write it to a Markdown file.

    Args:
        client: An ``OpenAI`` client configured for the DashScope endpoint.
        pdf_file: Path of the PDF to summarize.

    Raises:
        ValueError: If the model returns no choices or an empty summary.
    """
    uploaded_file = client.files.create(file=pdf_file, purpose="file-extract")
    print(f"Uploaded: {uploaded_file.id}")
    try:
        response = client.chat.completions.create(
            model="qwen-long",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                # The uploaded document is referenced by id in a system message.
                {"role": "system", "content": f"fileid://{uploaded_file.id}"},
                {"role": "user", "content": USER_PROMPT},
            ],
            stream=False,
        )
        if not response.choices:
            raise ValueError("No response choices available")
        summary = response.choices[0].message.content
        if not summary or not summary.strip():
            raise ValueError("Empty summary generated")

        md_path = build_markdown_path(pdf_file)
        with open(md_path, "w", encoding="utf-8") as md_file:
            md_file.write(summary)
        print(f"Saved: {md_path}")
    finally:
        # Always remove the uploaded copy, including when the request or the
        # write failed, so failed runs do not leave files behind remotely.
        try:
            time.sleep(3)
            deleted_file = client.files.delete(uploaded_file.id)
            print(deleted_file.model_dump_json())
        except Exception as cleanup_error:
            # Report but do not mask the primary error (if any).
            print(f"Cleanup failed for {uploaded_file.id}: {cleanup_error}")


def digest_all_pdfs():
    """Summarize every ``*.pdf`` in ``folder_path``; log failures and continue."""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    for pdf_file in sorted(folder_path.glob("*.pdf")):
        print(f"Processing: {pdf_file.name}")
        try:
            digest_pdf(client, pdf_file)
        except Exception as e:
            print(f"Error processing {pdf_file.name}: {str(e)}")
            # Explicit UTF-8 so non-ASCII file names can always be logged.
            with open(folder_path / "error_log.txt", "a", encoding="utf-8") as f:
                f.write(f"{time.ctime()} | {pdf_file.name} | {str(e)}\n")
    print("处理完成!")


if __name__ == "__main__":
    digest_all_pdfs()
效果预览:


68篇里3篇失败,再次提交后失败的3篇里2篇成功,剩1篇重复了几次都不行。成功率可以接受。
感谢樊总分享👍看起来很实用的功能