Skip to content

Commit ecfe821

Browse files
committed
squash: selected publications changes
1 parent 6d57961 commit ecfe821

12 files changed

Lines changed: 265 additions & 9 deletions

.github/workflows/auto-update.yml

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
name: Regular Auto Update
2+
3+
env:
4+
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
5+
6+
on:
7+
schedule:
8+
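# Intended as "every two weeks, Monday 02:00", but cron cannot express a biweekly interval: the day-of-week step `1/2` expands to 1,3,5, so this runs at 02:00 UTC every Monday, Wednesday and Friday.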
9+
- cron: '0 2 * * 1/2'
10+
workflow_dispatch:
11+
12+
permissions:
13+
contents: write
14+
pull-requests: write
15+
16+
concurrency:
17+
group: auto-scrape
18+
cancel-in-progress: false
19+
20+
jobs:
21+
scrape-commit:
22+
runs-on: ubuntu-latest
23+
24+
steps:
25+
- name: Checkout repository
26+
uses: actions/checkout@v5
27+
with:
28+
submodules: recursive
29+
30+
- name: Setup Python
31+
uses: actions/setup-python@v6
32+
with:
33+
python-version: '3.11'
34+
35+
- name: Setup Node.js
36+
uses: actions/setup-node@v4
37+
with:
38+
node-version: '20'
39+
40+
- name: Detect scraper directory
41+
run: |
42+
if [ -x "./sp" ]; then
43+
echo "SCRAPER_DIR=." >> "$GITHUB_ENV"
44+
elif [ -x "./dblp-publications-scraper/sp" ]; then
45+
echo "SCRAPER_DIR=./dblp-publications-scraper" >> "$GITHUB_ENV"
46+
else
47+
echo "Cannot find sp launcher. Checked ./sp and ./dblp-publications-scraper/sp"
48+
ls -la
49+
exit 1
50+
fi
51+
52+
- name: Install scraper dependencies
53+
working-directory: ${{ env.SCRAPER_DIR }}
54+
run: |
55+
chmod +x ./sp
56+
./sp i
57+
58+
- name: Run scraper
59+
working-directory: ${{ env.SCRAPER_DIR }}
60+
env:
61+
VENUE_SHORT_LLM_API_KEY: ${{ secrets.VENUE_SHORT_LLM_API_KEY }}
62+
run: |
63+
CONFIG_FILE="config.github.json"
64+
if [ ! -f "$CONFIG_FILE" ]; then
65+
CONFIG_FILE="config.json"
66+
fi
67+
68+
if [ ! -f "$CONFIG_FILE" ]; then
69+
echo "No config file found in $PWD"
70+
ls -la
71+
exit 1
72+
fi
73+
74+
CI_CONFIG_FILE="config.ci.json"
75+
python - <<'PY'
76+
import json
77+
78+
source_file = "config.github.json"
79+
try:
80+
with open(source_file, "r", encoding="utf-8") as f:
81+
cfg = json.load(f)
82+
except FileNotFoundError:
83+
source_file = "config.json"
84+
with open(source_file, "r", encoding="utf-8") as f:
85+
cfg = json.load(f)
86+
87+
cfg["existing_js_path"] = "../collection/auto-collected/auto_collected.js"
88+
89+
with open("config.ci.json", "w", encoding="utf-8") as f:
90+
json.dump(cfg, f, ensure_ascii=False, indent=2)
91+
92+
print(f"Prepared CI config from: {source_file}")
93+
PY
94+
95+
echo "Using config: $CI_CONFIG_FILE"
96+
./sp --config "$CI_CONFIG_FILE"
97+
98+
99+
100+
- name: Check for changes to commit
101+
id: check_changes
102+
run: |
103+
git add -A
104+
if git diff --cached --quiet; then
105+
echo "changes_detected=false" >> $GITHUB_OUTPUT
106+
echo "No changes to commit, skipping PR."
107+
exit 0
108+
else
109+
echo "changes_detected=true" >> $GITHUB_OUTPUT
110+
fi
111+
112+
- name: Create pull request for updates
113+
if: steps.check_changes.outputs.changes_detected == 'true'
114+
uses: peter-evans/create-pull-request@v7
115+
with:
116+
commit-message: "chore: auto update publications"
117+
branch: "chore/auto-scrape-updates"
118+
delete-branch: true
119+
title: "chore: auto update publications"
120+
body: |
121+
This PR is automatically generated by the Auto Scrape workflow.
122+
123+
Changes include:
124+
- refreshed auto-collected publications
125+
- updated bundled JSON assets
126+
labels: |
127+
automation
128+
publications

.github/workflows/bundle.yml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,4 @@ jobs:
4242
with:
4343
add: public/bundle.json
4444
author_name: GitHub Action
45-
message: bundle JSON asset
46-
47-
48-
49-
45+
message: bundle JSON asset

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "dblp-publications-scraper"]
2+
path = dblp-publications-scraper
3+
url = https://github.com/imethanguo/dblp-publications-scraper.git

.tmp_scan_final.cjs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
const fs = require('fs');
2+
const path = require('path');
3+
4+
// Root directory of the publication collection, resolved against the current
// working directory (the script is expected to be run from the repository root).
const collectionBase = path.resolve('collection');

// Module holding the automatically collected publications that the scan reports on.
const autoPath = path.join(collectionBase, 'auto-collected', 'auto_collected.js');
6+
7+
/**
 * Load a collection file and return its contents as an array.
 *
 * Files with a `.js` extension (case-insensitive) are loaded through
 * `require`, after evicting any cached copy so the file is re-evaluated.
 * Every other file is read as UTF-8 text and parsed as JSON.
 *
 * @param {string} filePath - Path to a `.js` module or a JSON file.
 * @returns {Array} The loaded value when it is an array, otherwise `[]`.
 * @throws {Error} If the file cannot be resolved, read, or parsed.
 */
function loadArray(filePath) {
  let data;
  if (path.extname(filePath).toLowerCase() === '.js') {
    delete require.cache[require.resolve(filePath)];
    data = require(filePath);
  } else {
    // Strip a leading UTF-8 byte-order mark: JSON.parse rejects it, which would
    // make an otherwise valid file look unreadable.
    const text = fs.readFileSync(filePath, 'utf8').replace(/^\uFEFF/, '');
    data = JSON.parse(text);
  }
  return Array.isArray(data) ? data : [];
}
17+
18+
/**
 * Build a comparison key for a publication title.
 *
 * The title is lower-cased and Unicode-normalized (NFKC), everything except
 * letters, combining marks, digits, underscores and whitespace is removed,
 * whitespace runs collapse to a single space, and the result is trimmed.
 *
 * @param {*} text - Raw title; any falsy value is treated as an empty title.
 * @returns {string} The normalized key, or `''` when nothing comparable remains.
 */
function normTitle(text) {
  return String(text || '')
    .toLowerCase()
    .normalize('NFKC')
    // Unicode-aware class: an ASCII-only `\w` would delete accented and
    // non-Latin letters, collapsing such titles to partial or empty keys.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .replace(/\s+/g, ' ')
    // Trim last: removing punctuation can expose leading/trailing spaces
    // (e.g. "Hello !" -> "hello "), which would defeat duplicate matching.
    .trim();
}
25+
26+
// Entries from the auto-collected module; these are the publications being audited.
const autoItems = loadArray(autoPath);

// Normalized title -> list of `{ file, title }` occurrences found in the rest of
// the collection. Filled by `walk`.
const refMap = new Map();
28+
29+
/**
 * Recursively index the publication titles found under `dir` into `refMap`.
 *
 * Only `.js` and `.json` files are read. The `auto-collected` directory is
 * skipped so that the auto-collected entries are compared against everything
 * else rather than against themselves. Files that fail to load are skipped
 * with a warning on stderr.
 *
 * @param {string} dir - Directory to scan (normally inside `collectionBase`).
 * @returns {void}
 */
function walk(dir) {
  for (const name of fs.readdirSync(dir)) {
    const abs = path.join(dir, name);
    // Collection-relative path with forward slashes, regardless of platform.
    const rel = path.relative(collectionBase, abs).split(path.sep).join('/');

    if (fs.statSync(abs).isDirectory()) {
      if (rel === 'auto-collected' || rel.startsWith('auto-collected/')) continue;
      walk(abs);
      continue;
    }
    if (!/\.(js|json)$/i.test(name)) continue;

    let arr;
    try {
      arr = loadArray(abs);
    } catch (err) {
      // Best-effort scan: keep going, but say which file was dropped. The
      // warning goes to stderr so the JSON report on stdout stays parseable.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`Skipping unreadable collection file ${rel}: ${reason}`);
      continue;
    }

    for (const item of arr) {
      if (!item || typeof item !== 'object') continue;
      const tk = normTitle(item.title);
      if (!tk) continue;
      if (!refMap.has(tk)) refMap.set(tk, []);
      refMap.get(tk).push({ file: rel, title: item.title || '' });
    }
  }
}
57+
58+
// Index every title outside `auto-collected` before auditing the auto entries.
walk(collectionBase);

// Auto-collected entries whose normalized title also appears elsewhere.
const duplicates = [];
// Titles of auto-collected entries lacking each metadata field.
const missing = {
  venue: [],
  venueShort: [],
  abstract: [],
};

for (const item of autoItems) {
  if (!item || typeof item !== 'object') continue;

  // Display label only; entries without a title are reported as "(untitled)".
  const title = String(item.title || '').trim() || '(untitled)';
  // Derive the match key from the real title, not the placeholder: otherwise a
  // title-less entry would be keyed as "untitled" and could be reported as a
  // duplicate of an unrelated publication that is literally titled "Untitled".
  const tk = normTitle(item.title);

  if (tk && refMap.has(tk)) {
    duplicates.push({
      title,
      // One entry per file, even if the title occurs several times in it.
      files: [...new Set(refMap.get(tk).map((ref) => ref.file))],
    });
  }

  if (!String(item.venue || '').trim()) missing.venue.push(title);
  if (!String(item.venueShort || '').trim()) missing.venueShort.push(title);
  if (!String(item.abstract || '').trim()) missing.abstract.push(title);
}

const out = {
  autoCount: autoItems.length,
  duplicateCount: duplicates.length,
  missingCounts: {
    venue: missing.venue.length,
    venueShort: missing.venueShort.length,
    abstract: missing.abstract.length,
  },
  duplicates,
  missing,
};

// The report is emitted as pretty-printed JSON on stdout.
console.log(JSON.stringify(out, null, 2));

.tmp_scan_final_result.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"autoCount": 41,
3+
"duplicateCount": 0,
4+
"missingCounts": {
5+
"venue": 2,
6+
"venueShort": 2,
7+
"abstract": 2
8+
},
9+
"duplicates": [],
10+
"missing": {
11+
"venue": [
12+
"Managing Software Supply Chains - Theory and Practice.",
13+
"Proceedings of the 15th Asia-Pacific Symposium on Internetware, Internetware 2024, Macau, SAR, China, July 24-26, 2024."
14+
],
15+
"venueShort": [
16+
"Managing Software Supply Chains - Theory and Practice.",
17+
"Proceedings of the 15th Asia-Pacific Symposium on Internetware, Internetware 2024, Macau, SAR, China, July 24-26, 2024."
18+
],
19+
"abstract": [
20+
"Proceedings of the 15th Asia-Pacific Symposium on Internetware, Internetware 2024, Macau, SAR, China, July 24-26, 2024.",
21+
"RegexScalpel: Regular Expression Denial of Service (ReDoS) Defense by Localize-and-Fix."
22+
]
23+
}
24+
}
44.4 KB
Binary file not shown.

check_missing_after_run.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// Prints how many auto-collected publications have no abstract, followed by a
// numbered "title | paperUrl" line for each of them.
const loaded = require('./collection/auto-collected/auto_collected.js');

// Guard against a module that does not export an array; `.filter` would
// otherwise throw a TypeError before anything is reported.
const pubs = Array.isArray(loaded) ? loaded : [];

// An entry counts as missing when its abstract is absent or whitespace-only.
const miss = pubs.filter((p) => !String((p && p.abstract) || '').trim());

console.log(`total=${pubs.length}`);
console.log(`missing=${miss.length}`);

miss.forEach((entry, index) => {
  // `entry` may itself be null/undefined, since such items pass the filter above.
  const p = entry || {};
  console.log(`${index + 1}. ${String(p.title || '')} | ${String(p.paperUrl || '')}`);
});

collection/auto_collected.js renamed to collection/auto-collected/auto_collected.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dblp-publications-scraper

dblp-publications-scraper.zip

6.35 MB
Binary file not shown.

0 commit comments

Comments
 (0)