{"id":8650,"date":"2025-02-19T06:26:54","date_gmt":"2025-02-18T22:26:54","guid":{"rendered":"https:\/\/fanyiming.life\/?p=8650"},"modified":"2025-12-21T01:03:59","modified_gmt":"2025-12-20T17:03:59","slug":"%e6%96%87%e7%8c%aepdf%e6%94%b9%e5%90%8d%e6%b6%88%e5%8c%96","status":"publish","type":"post","link":"https:\/\/fanyiming.life\/?p=8650","title":{"rendered":"\u6587\u732epdf\u6539\u540d&amp;AI\u6d88\u5316"},"content":{"rendered":"\n<p>\u5206\u4eab\u4e24\u4e2a\u7528\u963f\u91cc\u4e91\u7684\u5927\u6a21\u578bQwen-long\u5bf9\u6587\u732e\u8fdb\u884c\u201c\u6279\u91cfpdf\u6539\u540d\u201d\u548c\u201c\u6279\u91cfAI\u6d88\u5316\u201d\u7684python\u4ee3\u7801\u3002<\/p>\n\n\n\n<p>\u6279\u91cfpdf\u6539\u540d\uff08\u5e74\u4efd_\u6742\u5fd7_\u6807\u9898.pdf\uff09\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import os\nimport time\nfrom pathlib import Path\nfrom openai import OpenAI\n\nclient = OpenAI(\n    api_key=\"sk-xxxxx(\u4f60\u7684\u963f\u91cc\u4e91API-key)xxxxxxxx\",\n    base_url=\"https:\/\/dashscope.aliyuncs.com\/compatible-mode\/v1\",\n)\nfolder_path = Path(r\"D:\\x\\x\u4f60\u653epdf\u7684\u6587\u4ef6\u5939\u8def\u5f84\\x\")\n\nfor pdf_file in folder_path.glob(\"*.pdf\"):\n    print(f\"Processing: {pdf_file.name}\")\n    \n    try:\n        # \u6587\u4ef6\u4e0a\u4f20\uff08\u4f7f\u7528create\u65b9\u6cd5+file-extract\u76ee\u7684\uff09\n        uploaded_file = client.files.create(\n            file=pdf_file,\n            purpose=\"file-extract\"\n        )\n        print(f\"Uploaded: {uploaded_file.id}\")\n\n        # \u6784\u9020\u5e26\u6587\u4ef6ID\u7684\u8bf7\u6c42\uff08\u4f18\u5316\u63d0\u793a\u8bcd\uff09\n        response = client.chat.completions.create(\n            model=\"qwen-long\",\n            messages=&#091;\n                {\"role\": \"system\", \"content\": 
\"\u4f60\u662f\u4e00\u4f4d\u5b66\u672f\u6587\u732e\u5904\u7406\u52a9\u624b\uff0c\u4f60\u7684\u4efb\u52a1\u662f\u4ece\u4e0a\u4f20\u7684PDF\u6587\u4ef6\u4e2d\u63d0\u53d6\u8bba\u6587\u7684\u53d1\u8868\u5e74\u4efd\u3001\u6742\u5fd7\u7b80\u5199\u548c\u6807\u9898\uff0c\u5e76\u6309\u7167\u4ee5\u4e0b\u683c\u5f0f\u8fd4\u56de\uff1a`\u5e74\u4efd_\u6742\u5fd7\u7b80\u5199_\u6807\u9898`\uff0c\u4e0d\u8981\u6dfb\u52a0\u4efb\u4f55\u5176\u4ed6\u5185\u5bb9\u6216\u89e3\u91ca\u3002\"},\n                {\"role\": \"system\", \"content\": f\"fileid:\/\/{uploaded_file.id}\"},\n                {\"role\": \"user\", \"content\": \"\u8bf7\u63d0\u53d6\u8fd9\u7bc7\u8bba\u6587\u7684\u53d1\u8868\u5e74\u4efd\u3001\u6742\u5fd7\u7b80\u5199\u548c\u6807\u9898\uff0c\u5e76\u6309\u7167`\u5e74\u4efd_\u6742\u5fd7\u7b80\u5199_\u6807\u9898`\u7684\u683c\u5f0f\u8fd4\u56de\uff0c\u4e0d\u8981\u6dfb\u52a0\u4efb\u4f55\u5176\u4ed6\u5185\u5bb9\u3002\"}\n            ],\n            stream=False\n        )\n\n        # \u5904\u7406\u7ed3\u679c\uff08\u6dfb\u52a0\u5bb9\u9519\u5224\u65ad\uff09\n        if not response.choices:\n            raise ValueError(\"No response choices available\")\n            \n        result = response.choices&#091;0].message.content\n        if not result.strip():\n            raise ValueError(\"Empty result generated\")\n\n        # \u68c0\u67e5\u7ed3\u679c\u683c\u5f0f\u662f\u5426\u4e3a`\u5e74\u4efd_\u6742\u5fd7\u7b80\u5199_\u6807\u9898`\n        if result.count(\"_\") != 2:\n            raise ValueError(\"Result format is incorrect, expected `\u5e74\u4efd_\u6742\u5fd7\u7b80\u5199_\u6807\u9898`\")\n\n        # \u53bb\u9664\u975e\u6cd5\u5b57\u7b26\u5e76\u9650\u5236\u957f\u5ea6\n        safe_result = \"\".join(c for c in result if c.isalnum() or c in (\" \", \"_\"))&#091;:100]  # \u9632\u6b62\u6587\u4ef6\u540d\u8fc7\u957f\n        safe_result = safe_result.strip()\n\n        # \u91cd\u547d\u540d\u6587\u4ef6\n        new_file_path = folder_path \/ f\"{safe_result}.pdf\"\n        
os.rename(pdf_file, new_file_path)\n        print(f\"Renamed: {pdf_file.name} -&gt; {new_file_path.name}\")\n\n        # \u6587\u4ef6\u6e05\u7406\uff08\u6dfb\u52a0\u5220\u9664\u786e\u8ba4\uff09\n        time.sleep(3)\n        deleted_file = client.files.delete(uploaded_file.id)\n        print(deleted_file.model_dump_json())\n\n    except Exception as e:\n        print(f\"Error processing {pdf_file.name}: {str(e)}\")\n        with open(folder_path\/\"error_log.txt\", \"a\") as f:\n            f.write(f\"{time.ctime()} | {pdf_file.name} | {str(e)}\\n\")\n        continue\n\nprint(\"\u5904\u7406\u5b8c\u6210\uff01\")\n<\/code><\/pre>\n\n\n\n<p>\u6279\u91cfAI\u6d88\u5316\uff08\u7ed3\u679c\u4fdd\u5b58\u5230md\u6587\u4ef6\uff09\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import os\nimport time\nfrom pathlib import Path\nfrom openai import OpenAI\n\nclient = OpenAI(\n    api_key=\"sk-xxxxx(\u4f60\u7684\u963f\u91cc\u4e91API-key)xxxxxxxx\",\n    base_url=\"https:\/\/dashscope.aliyuncs.com\/compatible-mode\/v1\",\n)\nfolder_path = Path(r\"D:\\x\\x\u4f60\u653epdf\u7684\u6587\u4ef6\u5939\u8def\u5f84\\x\")\n\nfor pdf_file in folder_path.glob(\"*.pdf\"):\n    print(f\"Processing: {pdf_file.name}\")\n    \n    try:\n        # \u6587\u4ef6\u4e0a\u4f20\uff08\u4f7f\u7528create\u65b9\u6cd5+file-extract\u76ee\u7684\uff09\n        uploaded_file = client.files.create(\n            file=pdf_file,\n            purpose=\"file-extract\"\n        )\n        print(f\"Uploaded: {uploaded_file.id}\")\n\n        # \u6784\u9020\u5e26\u6587\u4ef6ID\u7684\u8bf7\u6c42\uff08\u4e24\u4e2asystem\u6d88\u606f\uff09\n        response = client.chat.completions.create(\n            model=\"qwen-long\",\n            messages=&#091;\n                {\"role\": \"system\", \"content\": 
\"\u4f60\u662f\u4e00\u4f4d\u751f\u7269\u533b\u5b66\u9886\u57df\u7684\u6587\u732e\u603b\u7ed3\u5927\u5e08\uff0c\u6309\u7167\u7528\u6237\u63d0\u51fa\u7684\u683c\u5f0f\u603b\u7ed3\u4e0a\u4f20\u7684\u5b66\u672f\u8bba\u6587\uff0c\u7528markdown\u8f93\u51fa\u5185\u5bb9\u3002\"},\n                {\"role\": \"system\", \"content\": f\"fileid:\/\/{uploaded_file.id}\"},\n                {\"role\": \"user\", \"content\": \"\u6587\u7ae0\u6807\u9898: &#091;\u8bf7\u586b\u5199\u6587\u7ae0\u6807\u9898]\\n\\n\"\n                                           \"\u8fd9\u7bc7\u8bba\u6587\u8bd5\u56fe\u89e3\u51b3\u4ec0\u4e48\u95ee\u9898\uff1f: &#091;\u7b80\u8981\u63cf\u8ff0\u8bba\u6587\u8bd5\u56fe\u89e3\u51b3\u7684\u6838\u5fc3\u95ee\u9898\u6216\u6311\u6218]\\n\\n\"\n                                           \"\u4e3b\u8981\u7814\u7a76\u5185\u5bb9\u548c\u7ed3\u8bba: &#091;\u6982\u62ec\u8bba\u6587\u7684\u4e3b\u8981\u7814\u7a76\u5185\u5bb9\u548c\u5f97\u51fa\u7684\u5173\u952e\u7ed3\u8bba]\\n\\n\"\n                                           \"\u8bba\u6587\u7684\u8d21\u732e: &#091;\u7528\u7b80\u6d01\u7684\u8bed\u8a00\u603b\u7ed3\u8bba\u6587\u7684\u4e3b\u8981\u8d21\u732e]\\n\\n\"\n                                           \"\u7814\u7a76\u610f\u4e49: &#091;\u8bf4\u660e\u8fd9\u9879\u7814\u7a76\u5bf9\u5b66\u672f\u754c\u6216\u5b9e\u9645\u5e94\u7528\u7684\u610f\u4e49]\\n\\n\"\n                                           \"\u672a\u6765\u7684\u7814\u7a76\u65b9\u5411: &#091;\u5217\u51fa\u4f5c\u8005\u63d0\u51fa\u7684\u672a\u6765\u7814\u7a76\u65b9\u5411\u6216\u5efa\u8bae]\\n\\n\"\n                                           \"\u76f8\u5173\u6587\u732e\u5f15\u7528: &#091;\u5217\u51fa\u8bba\u6587\u4e2d\u5f15\u7528\u7684\u4e0eRTCB\u6216RNA ligation\u76f8\u5173\u7684\u6587\u732e]\"}\n            ],\n            stream=False\n        )\n\n        # \u5904\u7406\u7ed3\u679c\uff08\u6dfb\u52a0\u5bb9\u9519\u5224\u65ad\uff09\n        if not response.choices:\n            raise ValueError(\"No response choices 
available\")\n            \n        summary = response.choices&#091;0].message.content\n        if not summary.strip():\n            raise ValueError(\"Empty summary generated\")\n\n        # \u4fdd\u5b58\u6587\u6863\uff08\u4f7f\u7528\u66f4\u5b89\u5168\u7684\u6587\u4ef6\u540d\u5904\u7406\uff09\n        safe_name = pdf_file.stem.replace(\" \", \"_\")&#091;:50]  # \u9632\u6b62\u6587\u4ef6\u540d\u8fc7\u957f\n        md_path = folder_path \/ f\"{safe_name}_AI\u6d88\u5316.md\"\n        \n        # \u5c06\u603b\u7ed3\u5185\u5bb9\u5199\u5165Markdown\u6587\u4ef6\n        with open(md_path, \"w\", encoding=\"utf-8\") as md_file:\n            md_file.write(summary)\n        print(f\"Saved: {md_path}\")\n\n        # \u6587\u4ef6\u6e05\u7406\uff08\u6dfb\u52a0\u5220\u9664\u786e\u8ba4\uff09\n        time.sleep(3)\n        deleted_file = client.files.delete(uploaded_file.id)\n        print(deleted_file.model_dump_json())\n\n    except Exception as e:\n        print(f\"Error processing {pdf_file.name}: {str(e)}\")\n        with open(folder_path\/\"error_log.txt\", \"a\") as f:\n            f.write(f\"{time.ctime()} | {pdf_file.name} | {str(e)}\\n\")\n        continue\n\nprint(\"\u5904\u7406\u5b8c\u6210\uff01\")\n<\/code><\/pre>\n\n\n\n<p>\u6548\u679c\u9884\u89c8\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"552\" src=\"https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219064522367-1024x552.png?x-oss-process=style\/default\" alt=\"\" class=\"wp-image-8654\" srcset=\"https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219064522367-1024x552.png?x-oss-process=style\/default 1024w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219064522367-300x162.png?x-oss-process=style\/default 300w, 
https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219064522367-768x414.png?x-oss-process=style\/default 768w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219064522367-1536x828.png?x-oss-process=style\/default 1536w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219064522367.png?x-oss-process=style\/default 1593w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" \/>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"551\" src=\"https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219070013584-1024x551.png?x-oss-process=style\/default\" alt=\"\" class=\"wp-image-8658\" srcset=\"https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219070013584-1024x551.png?x-oss-process=style\/default 1024w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219070013584-300x161.png?x-oss-process=style\/default 300w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219070013584-768x413.png?x-oss-process=style\/default 768w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219070013584-1536x827.png?x-oss-process=style\/default 1536w, https:\/\/fanym.oss-cn-beijing.aliyuncs.com\/wp-content\/uploads\/2025\/02\/20250219070013584.png?x-oss-process=style\/default 1919w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" \/>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow 
wp-block-quote-is-layout-flow\">\n<p>68\u7bc7\u91cc3\u7bc7\u5931\u8d25\uff0c\u518d\u6b21\u63d0\u4ea4\u540e\u5931\u8d25\u76843\u7bc7\u91cc2\u7bc7\u6210\u529f\uff0c\u52691\u7bc7\u91cd\u590d\u4e86\u51e0\u6b21\u90fd\u4e0d\u884c\u3002\u6210\u529f\u7387\u53ef\u4ee5\u63a5\u53d7\u3002<\/p>\n<\/blockquote>\n","protected":false},"excerpt":{"rendered":"<p>\u8fd9\u7bc7\u7b14\u8bb0\u5206\u4eab\u4e86\u4e24\u4e2a\u5229\u7528\u963f\u91cc\u4e91Qwen-long\u6a21\u578b\u5904\u7406\u6587\u732e\u7684Python\u811a\u672c\uff1a\u4e00\u4e2a\u7528\u4e8e\u6279\u91cf\u63d0\u53d6PDF\u5143\u6570\u636e\u5e76\u667a\u80fd\u91cd\u547d\u540d\uff08\u683c\u5f0f\u4e3a\u201c\u5e74\u4efd_\u6742\u5fd7_\u6807\u9898\u201d\uff09\uff0c\u53e6\u4e00\u4e2a\u7528\u4e8e\u6279\u91cf\u751f\u6210\u5305\u542b\u7814\u7a76\u95ee\u9898\u3001\u7ed3\u8bba\u3001\u8d21\u732e\u7b49\u7ed3\u6784\u5316\u6458\u8981\u7684Markdown\u6587\u4ef6\u3002<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-8650","post","type-post","status-publish","format-standard","hentry","category-note"],"_links":{"self":[{"href":"https:\/\/fanyiming.life\/index.php?rest_route=\/wp\/v2\/posts\/8650","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/fanyiming.life\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/fanyiming.life\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/fanyiming.life\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/fanyiming.life\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=8650"}],"version-history":[{"count":2,"href":"https:\/\/fanyiming.life\/index.php?rest_route=\/wp\/v2\/posts\/8650\/revisions"}],"predecessor-version":[{"id":13880,"href":"https:\/\/fanyiming.life\/index.php?rest_route=\/wp\/v2\/posts\/8650\/revisions\/13880"}],"wp:atta
chment":[{"href":"https:\/\/fanyiming.life\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=8650"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/fanyiming.life\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=8650"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/fanyiming.life\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=8650"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}