-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbatch_convert_arxiv.py
More file actions
executable file
·109 lines (86 loc) · 2.83 KB
/
batch_convert_arxiv.py
File metadata and controls
executable file
·109 lines (86 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
"""
Batch convert arXiv PDFs to Markdown with GPU offloading.
Usage:
python batch_convert_arxiv.py
Configuration:
Edit INPUT_DIR and OUTPUT_DIR below to customize paths.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent / "src"))
from nuoyi.api import convert_directory, clear_converter_cache
try:
import torch
except ImportError:
torch = None
# Default run configuration; edit these values to customize a run.
INPUT_DIR = "/home/fred/Documents/参考文献/arXiv_45000/pdfs/arxiv"
OUTPUT_DIR = "/home/fred/Documents/参考文献/arXiv_45000/markdown"
DEVICE = "cuda"
LOW_VRAM = True
RECURSIVE = False


def _empty_cuda_cache():
    """Empty the CUDA cache if torch was imported and CUDA is available.

    Returns:
        True when ``torch.cuda.empty_cache()`` was called, False otherwise.
    """
    if torch and torch.cuda.is_available():
        torch.cuda.empty_cache()
        return True
    return False


def _print_report(result):
    """Print the conversion summary plus a short sample of per-file outcomes.

    Args:
        result: Object returned by ``convert_directory``. Only ``.metadata``
            and ``.data`` are read, both through ``.get`` with defaults.
            NOTE(review): the exact schema is defined in ``nuoyi.api`` and is
            not visible here -- confirm the key names against that module.

    Returns:
        The value stored under ``result.data["failed"]`` (0 when absent).
    """
    from itertools import islice

    rule = "=" * 70
    failed_total = result.data.get("failed", 0)
    success_total = result.data.get("success", 0)
    files = result.data.get("files", [])

    print()
    print(rule)
    print("Conversion Summary:")
    print(rule)
    print(f"Total files: {result.metadata.get('total', 0)}")
    print(f"Successful: {success_total}")
    print(f"Failed: {failed_total}")
    print(rule)

    if failed_total > 0:
        print("\nFailed files (first 10):")
        failures = (info for info in files if not info.get("success"))
        for info in islice(failures, 10):
            print(f" ✗ {info['file']}")
            if info.get("error"):
                # Cap long error text so one failure cannot flood the report.
                print(f" Error: {info['error'][:200]}")

    if success_total > 0:
        print("\nSuccessful files (first 5):")
        successes = (info for info in files if info.get("success"))
        for info in islice(successes, 5):
            print(f" ✓ {info['file']}")

    return failed_total


def main(
    input_dir=INPUT_DIR,
    output_dir=OUTPUT_DIR,
    device=DEVICE,
    low_vram=LOW_VRAM,
    recursive=RECURSIVE,
):
    """Convert every PDF under ``input_dir`` to Markdown and print a report.

    All arguments default to the module-level configuration constants, so
    calling ``main()`` with no arguments runs the configured batch job.

    Args:
        input_dir: Directory passed to ``convert_directory`` as the source.
        output_dir: Directory for results; created (with parents) if missing.
        device: Forwarded to ``convert_directory`` as ``device``.
        low_vram: Forwarded to ``convert_directory`` as ``low_vram``. The
            banner describes this mode as "Layout GPU, OCR CPU".
        recursive: Forwarded to ``convert_directory`` as ``recursive``.

    Raises:
        SystemExit: Always. Exit code 1 when ``input_dir`` is not an existing
            directory or when the result reports failed files; 0 otherwise.
    """
    rule = "=" * 70
    print(rule)
    print("arXiv PDF Batch Conversion with GPU Offloading")
    print(rule)
    print(f"Input: {input_dir}")
    print(f"Output: {output_dir}")
    print(f"Device: {device}")
    print(f"Low VRAM Mode: {low_vram} (Layout GPU, OCR CPU)")
    print(rule)
    print()

    # is_dir() rather than exists(): a regular file at this path is not a
    # usable input directory and must be rejected before conversion starts.
    if not Path(input_dir).is_dir():
        print(f"Error: Input directory not found: {input_dir}")
        sys.exit(1)

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    if _empty_cuda_cache():
        print("[Memory] GPU cache cleared")

    try:
        result = convert_directory(
            input_dir,
            output_dir=output_dir,
            device=device,
            low_vram=low_vram,
            recursive=recursive,
        )
        failed_total = _print_report(result)
    finally:
        # Clear converter and GPU caches even when conversion or reporting
        # raises, so an importing caller is not left holding them.
        clear_converter_cache()
        _empty_cuda_cache()

    print()
    print("✅ Batch conversion completed!")

    sys.exit(1 if failed_total > 0 else 0)
# Run the batch job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()