// ==UserScript==
// @name Crawler base on SingleFile
// @author Mark
// @description Download site in single file automatically
// @license MIT
// @version 0.0.11
// @match https://*/*
// @run-at document-idle
// @grant GM.setValue
// @grant GM.getValue
// @grant GM.xmlHttpRequest
// @grant GM_registerMenuCommand
// @grant unsafeWindow
// @require https://update.gf.qytechs.cn/scripts/483730/1305396/gm-fetch.js
// @connect *
// @noframes
// @namespace https://gf.qytechs.cn/users/1106595
// ==/UserScript==
/**
 * Inject a remote script into the page by appending a <script> element to
 * document.body. If the script fails to load, the attempt is repeated every
 * 2 seconds until it succeeds.
 *
 * @param {string} url - Absolute URL of the script to load.
 */
const addScript = (url) => {
  const s = document.createElement("script");
  s.src = url;
  s.onerror = () => {
    // Drop the failed element first; otherwise every retry leaves another
    // dead <script> tag behind in the page.
    s.remove();
    setTimeout(() => {
      addScript(url);
    }, 2000);
  };
  document.body.append(s);
};
/**
 * Build a random identifier for this browser: a random number in [0, 1e6)
 * rendered in base 32 with the radix point stripped.
 *
 * @returns {string} Identifier made of base-32 digits (0-9, a-v).
 */
const generateClientId = () => {
  const scaled = Math.random() * 1e6;
  const base32 = scaled.toString(32);
  return base32.replace(".", "");
};
// main function
(function () {
"use strict";
// Remote scripts injected as soon as the userscript starts, in this order:
// the crawler's own config and validators, then the SingleFile helpers.
const bootstrapScripts = [
  "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js",
  "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
];
for (const scriptUrl of bootstrapScripts) {
  addScript(scriptUrl);
}
// Replace the page's fetch so requests that fail (e.g. blocked by CORS) are
// retried through GM_fetch, which is provided by the @require'd gm-fetch.js.
window.unsafeWindow.fetch = async (...requestArgs) => {
  const nativeAttempt = fetch(...requestArgs);
  // Only a rejected native request falls back to the userscript-level fetch.
  const withFallback = nativeAttempt.catch(() => GM_fetch(...requestArgs));
  return await withFallback;
};
/**
 * Save `data` as a file by clicking a temporary, hidden download link that
 * points at a Blob URL.
 *
 * @param {BlobPart} data - Content of the file.
 * @param {string} fileName - Name suggested to the browser for the download.
 */
const downloadFile = (data, fileName) => {
  const blob = new Blob([data], {
    type: "application/octet-stream",
  });
  const url = window.URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.style.display = "none";
  a.href = url;
  a.download = fileName;
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Remove the helper link again: left in place it accumulates on every
    // download and shows up in any later snapshot of the page's HTML.
    a.remove();
    window.URL.revokeObjectURL(url);
  }
};
/**
 * Pause for the given number of seconds.
 *
 * @param {number} duration - Delay in seconds.
 * @returns {Promise<void>} Resolves (with no value) once the delay elapsed.
 */
const sleep = (duration) =>
  new Promise((resolve) => {
    const delayMs = duration * 1000;
    setTimeout(resolve, delayMs);
  });
/**
 * Log a styled warning, wait, then reload the current page.
 *
 * @param {number} [waiting=60] - Seconds to wait before reloading.
 * @param {string} [message=""] - Reason shown in the console warning.
 * @returns {Promise<void>}
 */
async function reload(waiting = 60, message = "") {
  const notice = `%c${message}, reload ${waiting}s later`;
  console.warn(notice, printStyle);
  await sleep(waiting);
  location.reload();
}
/**
 * Open the browser's file picker and resolve with the user's selection.
 *
 * A temporary <input type="file"> is appended to the page and clicked. There
 * is no dedicated handling of the picker being dismissed, so a click listener
 * is armed on the window 100 ms later: the first click that arrives while the
 * input is still empty is treated as "the user cancelled".
 *
 * @param {string} [accept=""] - Value for the input's `accept` attribute.
 * @param {boolean} [multiple=false] - Allow selecting several files.
 * @returns {Promise<File|FileList>} The first selected file, or the whole
 *   FileList when `multiple` is truthy. Rejects with an Error when the
 *   selection is considered cancelled.
 */
function readFile(accept = "", multiple = false) {
  const inputEl = document.createElement("input");
  inputEl.setAttribute("type", "file");
  inputEl.setAttribute("accept", accept);
  // `multiple` is a boolean attribute: setAttribute("multiple", false) would
  // still switch it on, so set the property instead.
  inputEl.multiple = Boolean(multiple);
  return new Promise((resolve, reject) => {
    let armTimer;
    const onWindowClick = () => {
      window.removeEventListener("click", onWindowClick, true);
      if (!inputEl.value) {
        inputEl.remove();
        // Message reads "user cancelled the selection".
        reject(new Error("用户取消选择"));
      }
    };
    inputEl.addEventListener("change", () => {
      clearTimeout(armTimer);
      window.removeEventListener("click", onWindowClick, true);
      const { files } = inputEl;
      // The helper input is no longer needed once a choice was made.
      inputEl.remove();
      resolve(multiple ? files : files[0]);
    });
    document.body.append(inputEl);
    inputEl.click();
    armTimer = setTimeout(() => {
      window.addEventListener("click", onWindowClick, true);
    }, 100);
  });
}
/**
 * Add a floating "Import" button (wrapper id "CRAWLER_ID") to the page.
 *
 * Clicking it asks for confirmation, lets the user pick a .json file and, if
 * the file holds an array whose items all have `doi` and `validator`, stores
 * that array under the "tasks" key and reloads the page. Anything else is
 * answered with an alert describing the expected format.
 */
function AddImportBtn() {
  const btnWrapImport = document.createElement("div");
  btnWrapImport.id = "CRAWLER_ID";
  btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
  const importBtn = btnWrapImport.querySelector("button");
  const formatHint =
    "Please upload json file like [{doi: string, validator: string, ...}]";
  importBtn.onclick = async () => {
    if (
      !window.confirm(
        "The data in browser will be clear up. Please make sure you have to do this !!!"
      )
    ) {
      return;
    }
    let file;
    try {
      file = await readFile(".json");
    } catch (err) {
      // readFile rejects when the picker is dismissed; nothing to import.
      console.warn(err);
      return;
    }
    let json;
    try {
      json = JSON.parse(await file.text());
    } catch (err) {
      console.error(err);
      alert(formatHint);
      return;
    }
    const isTaskList =
      Array.isArray(json) &&
      json.every((item) => item?.doi && item?.validator);
    if (!isTaskList) {
      alert(formatHint);
      return;
    }
    // Wait until the tasks are stored; reloading earlier can lose them.
    await GM.setValue("tasks", json);
    location.reload();
  };
  document.body.appendChild(btnWrapImport);
}
/**
 * Take the import button's wrapper (element id "CRAWLER_ID") out of the page.
 * Does nothing when no such element exists.
 */
function removeImportBtn() {
  const wrapper = document.getElementById("CRAWLER_ID");
  if (!wrapper) {
    return;
  }
  const host = wrapper.parentElement;
  host.removeChild(wrapper);
}
// "Download" menu command: exports the stored task list as a JSON file named
// <year>-<month>-<day>-<hours><minutes><seconds>-<total>-<processed>.json,
// where <processed> is the number of tasks that are no longer waiting.
GM_registerMenuCommand("Download", async () => {
  // GM.getValue resolves to undefined when nothing has been stored yet;
  // export an empty list in that case instead of throwing.
  const taskData = (await GM.getValue("tasks")) || [];
  const waitingTasks = taskData.filter(
    (task) =>
      !task.downloaded &&
      task.validated === undefined &&
      validators[task.validator]
  );
  const now = new Date();
  downloadFile(
    JSON.stringify(taskData),
    `${now.getFullYear()}-${
      now.getMonth() + 1
    }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
      taskData.length
    }-${taskData.length - waitingTasks.length}.json`
  );
});
// CSS passed as the `%c` style argument of this script's console messages.
const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
/**
 * Check whether the globals `validators` and `DEFAULT_CONFIG` are available.
 * Makes up to two attempts, 5 seconds apart.
 *
 * @returns {Promise<boolean>} true as soon as both globals are truthy, false
 *   when they are still missing after the last attempt.
 */
const checker = async () => {
  const maxAttempts = 2;
  const retryDelaySeconds = 5;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      if (validators && DEFAULT_CONFIG) {
        return true;
      }
    } catch (err) {
      // Reading a global that does not exist yet throws a ReferenceError;
      // that simply means "not ready", so fall through to the retry.
    }
    // Wait only between attempts: not after success or the final failure.
    if (attempt < maxAttempts) {
      await sleep(retryDelaySeconds);
    }
  }
  return false;
};
/**
 * Run one crawl step on the current page.
 *
 * Steps, in order:
 *  1. Show the import button, wait, and inject single-file.js.
 *  2. Load the stored tasks; reload the page if `validators` /
 *     `DEFAULT_CONFIG` are not available.
 *  3. Report progress counters to the remote endpoint (not awaited).
 *  4. Reload later if no task is waiting.
 *  5. For the first waiting task: flag a Cloudflare challenge, redirect to
 *     the task's DOI URL if the page text does not contain the DOI, or
 *     validate the page and download it (SingleFile output plus raw HTML).
 *  6. Store the updated tasks and navigate to the next waiting task, or
 *     reload later if there is none.
 *
 * @returns {Promise<void>}
 */
async function start() {
  console.log(new Date());
  AddImportBtn();
  await sleep(7);
  addScript(
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
  );
  const taskData = await GM.getValue("tasks");
  const tasks = taskData || [];
  const available = await checker();
  if (!available) {
    await reload(5, "Can not get validators or DEFAULT_CONFIG");
    return;
  }
  // Tasks that were neither downloaded nor validated before and whose
  // validator is known.
  const waitingTasks = tasks.filter(
    (task) =>
      !task.downloaded &&
      task.validated === undefined &&
      validators[task.validator]
  );
  console.log(
    `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
    printStyle,
    tasks
  );
  // ---------------------------- Report progress -----------------------------------------------------
  let clientId = await GM.getValue("clientId");
  if (typeof clientId !== "string" || !clientId) {
    clientId = generateClientId();
    await GM.setValue("clientId", clientId);
  }
  const invalidatedTasks = tasks.filter((task) => task.validated === false);
  // Most recently updated first.
  const doneTasks = tasks
    .filter((task) => task.downloaded)
    .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
  const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
  const last24hDoneTasks = doneTasks.filter(
    (task) => task.updateTime > previousDay
  );
  const lastDoneTime = new Date(doneTasks[0]?.updateTime);
  // Without a finished task (or without its timestamp) there is no valid
  // date; report "N/A" rather than "Invalid Date".
  const lastDoneLabel = Number.isNaN(lastDoneTime.getTime())
    ? "N/A"
    : lastDoneTime.toLocaleString();
  const reportTip = `Last download time: ${lastDoneLabel}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
  // Fire-and-forget: crawling must not wait for, or fail with, the report.
  GM.xmlHttpRequest({
    url: "https://crawler-hit.deno.dev/api/update",
    method: "POST",
    headers: { "Content-Type": "application/json" },
    data: JSON.stringify({
      account: clientId,
      invalidate_count: invalidatedTasks.length,
      done_count: doneTasks.length,
      queue_count: waitingTasks.length,
      tip: reportTip,
    }),
  })
    .then((res) => {
      window.tts = res;
      console.log({ res });
    })
    .catch((err) => {
      console.warn("Progress report failed", err);
    });
  if (!waitingTasks.length) {
    await reload(90, "No tasks waiting");
    return;
  }
  // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
  await sleep(10);
  const currentTask = waitingTasks[0];
  const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
  const validator = validators[currentTask.validator];
  if (document.getElementById("challenge-form")) {
    console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
    await sleep(20);
    currentTask.validated = false;
    currentTask.cloudflareBlock = true;
  }
  // --------------------------- Page validate ------------------------------------------------------
  if (
    !currentTask.cloudflareBlock &&
    !document.body.textContent.toLowerCase().includes(doi)
  ) {
    console.log(
      `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
      printStyle
    );
    await sleep(5);
    location.href = currentTask.doi;
    return;
  }
  if (!currentTask.cloudflareBlock && validator(document)) {
    console.log(
      "%cValidate successfully! Downloading page...",
      printStyle,
      waitingTasks,
      tasks
    );
    removeImportBtn();
    // Repair special pages; skipped when no fixer registry is loaded.
    if (
      typeof documentFixer !== "undefined" &&
      typeof documentFixer[currentTask.validator] === "function"
    ) {
      documentFixer[currentTask.validator](document);
    }
    try {
      const data = await singlefile.getPageData(DEFAULT_CONFIG);
      downloadFile(
        data.content,
        `${doi.replaceAll("/", "_")}.singlefile.html`
      );
      downloadFile(
        document.body.parentElement.outerHTML,
        `${doi.replaceAll("/", "_")}.html`
      );
      currentTask.downloaded = true;
      currentTask.validated = true;
      currentTask.updateTime = new Date().valueOf();
    } catch (error) {
      console.error(error);
      await reload(10, `singlefile error! ${currentTask.doi}`);
      return;
    }
  } else {
    console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
    currentTask.validated = false;
    currentTask.updateTime = new Date().valueOf();
  }
  await GM.setValue("tasks", tasks);
  // --------------------------- Prepare next task ------------------------------------------------------
  const nextTask = waitingTasks[1];
  if (nextTask) {
    console.log(
      `%cStart next task 10s later...`,
      printStyle,
      nextTask.doi,
      tasks
    );
    await sleep(10);
    location.href = nextTask.doi;
  } else {
    await reload(60, "No tasks waiting");
  }
}
start();
})();