// ==UserScript==
// @name Crawler base on SingleFile
// @author Mark
// @description Download site in single file automatically
// @license MIT
// @version 0.0.8
// @match https://*/*
// @run-at document-idle
// @grant GM.setValue
// @grant GM.getValue
// @grant GM.xmlHttpRequest
// @grant GM_registerMenuCommand
// @grant unsafeWindow
// @noframes
// @namespace https://gf.qytechs.cn/users/1106595
// ==/UserScript==
// Configuration for SingleFile.
// Background saving is flagged as unsupported when the user agent string
// matches /Mobile.*Firefox/.
const BACKGROUND_SAVE_SUPPORTED =
  navigator.userAgent.search(/Mobile.*Firefox/) === -1;
// Options object handed to `singlefile.getPageData()` when a page is saved.
// NOTE(review): the key names presumably follow SingleFile's own option
// schema — confirm against the SingleFile build that is loaded from the CDN.
const DEFAULT_CONFIG = {
// --- Page clean-up and compression ---
removeHiddenElements: true,
removeUnusedStyles: true,
removeUnusedFonts: true,
removeFrames: false,
compressHTML: true,
compressCSS: false,
// --- Deferred (lazy-loaded) images ---
loadDeferredImages: true,
loadDeferredImagesMaxIdleTime: 1500,
loadDeferredImagesBlockCookies: false,
loadDeferredImagesBlockStorage: false,
loadDeferredImagesKeepZoomLevel: false,
loadDeferredImagesDispatchScrollEvent: false,
loadDeferredImagesBeforeFrames: false,
// --- File naming ---
filenameTemplate:
"%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
infobarTemplate: "",
includeInfobar: false,
confirmInfobarContent: false,
autoClose: false,
confirmFilename: false,
filenameConflictAction: "uniquify",
filenameMaxLength: 192,
filenameMaxLengthUnit: "bytes",
// Characters and character ranges listed for replacement in file names
// (presumably substituted with `filenameReplacementCharacter` — confirm).
filenameReplacedCharacters: [
"~",
"+",
"\\\\",
"?",
"%",
"*",
":",
"|",
'"',
"<",
">",
"\x00-\x1f",
"\x7F",
],
filenameReplacementCharacter: "_",
replaceEmojisInFilename: false,
saveFilenameTemplateData: false,
// --- UI related switches ---
contextMenuEnabled: true,
tabMenuEnabled: true,
browserActionMenuEnabled: true,
shadowEnabled: true,
logsEnabled: true,
progressBarEnabled: true,
maxResourceSizeEnabled: false,
maxResourceSize: 10,
displayInfobar: true,
displayStats: false,
// Takes its value from the user-agent check in BACKGROUND_SAVE_SUPPORTED.
backgroundSave: BACKGROUND_SAVE_SUPPORTED,
defaultEditorMode: "normal",
applySystemTheme: true,
// --- Auto-save ---
autoSaveDelay: 1,
autoSaveLoad: false,
autoSaveUnload: false,
autoSaveLoadOrUnload: true,
autoSaveDiscard: false,
autoSaveRemove: false,
autoSaveRepeat: false,
autoSaveRepeatDelay: 10,
// --- Alternative / duplicate resources ---
removeAlternativeFonts: true,
removeAlternativeMedias: true,
removeAlternativeImages: true,
groupDuplicateImages: true,
maxSizeDuplicateImages: 512 * 1024,
// --- Save destinations (all disabled, credentials left empty) ---
saveRawPage: false,
saveToClipboard: false,
addProof: false,
saveToGDrive: false,
saveToDropbox: false,
saveWithWebDAV: false,
webDAVURL: "",
webDAVUser: "",
webDAVPassword: "",
saveToGitHub: false,
githubToken: "",
githubUser: "",
githubRepository: "SingleFile-Archives",
githubBranch: "main",
saveWithCompanion: false,
forceWebAuthFlow: false,
resolveFragmentIdentifierURLs: false,
userScriptEnabled: false,
openEditor: false,
openSavedPage: false,
autoOpenEditor: false,
// --- Bookmarks ---
saveCreatedBookmarks: false,
allowedBookmarkFolders: [],
ignoredBookmarkFolders: [],
replaceBookmarkURL: true,
// --- Output document details ---
saveFavicon: true,
includeBOM: false,
warnUnsavedPage: true,
displayInfobarInEditor: false,
compressContent: false,
createRootDirectory: false,
selfExtractingArchive: true,
extractDataFromPage: true,
preventAppendedData: false,
insertTextBody: false,
autoSaveExternalSave: false,
insertMetaNoIndex: false,
insertMetaCSP: true,
passReferrerOnError: false,
password: "",
insertSingleFileComment: true,
removeSavedDate: false,
blockMixedContent: false,
saveOriginalURLs: false,
// Accept header values keyed by resource type.
acceptHeaders: {
font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
stylesheet: "text/css,*/*;q=0.1",
script: "*/*",
document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
video:
"video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
audio:
"audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
},
moveStylesInHead: false,
networkTimeout: 0,
woleetKey: "",
// --- Resource blocking: scripts, videos and audios are flagged as blocked;
// images, stylesheets and fonts are not. ---
blockImages: false,
blockStylesheets: false,
blockFonts: false,
blockScripts: true,
blockVideos: true,
blockAudios: true,
_migratedTemplateFormat: true,
};
/**
 * Page validators, one per publisher.
 *
 * Each validator receives a `document` and returns a truthy value only when
 * both an abstract-like element and at least one body paragraph can be found
 * with the publisher-specific selectors below. A task refers to its validator
 * through `task.validator`.
 *
 * NOTE(review): the numeric keys look like DOI registrant prefixes
 * (presumably "10.<key>") — confirm against the task data.
 */
const validators = {
  1002: (document) =>
    (document.querySelector(
      ".article__body .abstract-group .article-section__abstract .article-section__content"
    ) || document.querySelector("article .abstract-group")) &&
    document.querySelectorAll(
      ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
    ).length > 0,
  1016: (document) =>
    (document.querySelector("div.abstract.author > div") ||
      document.querySelector('[data-left-hand-nav="Summary"]')) &&
    (document.querySelectorAll(
      "div#body > div:first-child > section[id^=s] p[id^=p]"
    ).length > 0 ||
      document.querySelectorAll(
        "div#body > div:first-child :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)"
      ).length > 0 ||
      document.querySelectorAll("[id^='sec'] .section-paragraph").length > 0 ||
      document.querySelectorAll("div#body [id^='sec'] p[id^='par']").length > 0),
  3390: (document) =>
    document.querySelector("#html-abstract .html-p") &&
    document.querySelectorAll("article .html-body .html-p").length > 0,
  1039: (document) =>
    document.querySelector("article .capsule__text") &&
    document.querySelectorAll("#pnlArticleContentLoaded > p").length > 0,
  1021: (document) =>
    (document.querySelector("p.articleBody_abstractText") ||
      document.querySelector("#specialIssueNotice") ||
      // The dc.Type meta tag may be missing; `?.` keeps the validator from
      // throwing a TypeError in that case and lets it report "not valid".
      document.querySelector('meta[name="dc.Type"]')?.content === 'review-article') &&
    (document.querySelectorAll("div.NLM_p").length > 0 ||
      document.querySelectorAll(".article_content-left > p").length > 0),
  1038: (document) =>
    (document.querySelector("#Abs1-content") ||
      document.querySelector('article [data-title="Abstract"]')) &&
    document.querySelectorAll(
      "article .main-content .c-article-section__content > p"
    ).length > 0,
  1007: (document) =>
    document.querySelectorAll("#Abs1-content p").length > 0 &&
    document.querySelectorAll(".main-content .c-article-section__content > p")
      .length > 0,
  // The multi-line selector below is kept byte-for-byte (continuation lines
  // start at column 0 so no extra whitespace ends up inside the string).
  1088: (document) =>
    document.querySelectorAll(".wd-jnl-art-abstract > p").length > 0 &&
    document.querySelectorAll(`:where(
div[itemprop="articleBody"] > p,
div[itemprop="articleBody"] > .article-text > p,
div[itemprop="articleBody"] > .article-text > .article-text > p,
div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
`).length > 0,
  1063: (document) =>
    document.querySelectorAll("#ContentTab .abstract p").length > 0 &&
    document.querySelectorAll("#ContentTab .article-section-wrapper > p")
      .length > 0,
  1126: (document) =>
    document.querySelectorAll('[role="doc-abstract"] > [role="paragraph"]')
      .length > 0 &&
    document.querySelectorAll(`#bodymatter [role="paragraph"]`).length > 0,
  // Validator "1155" is disabled because of CSP; it needs the
  // "fetch-url2.deno.dev" fetch to be fixed first. Kept for reference:
  // const addScript = async (url) => {
  //   const s = document.createElement("script");
  //   const res = await GM.xmlHttpRequest({
  //     url: url,
  //     method: "GET",
  //   });
  //   const text = res.responseText;
  //   s.innerHTML = text;
  //   document.body.append(s);
  // };
  // 1155: (document) =>
  //   document.querySelector(".articleBody #abstract") &&
  //   document.querySelectorAll(".articleBody .xml-content > p:not(#abstract + p)").length > 0,
  1074: (document) =>
    document.querySelector('.article__sections section:first-child:not(section[id^="cesec"])') &&
    document.querySelectorAll('.article__sections section[id^="cesec"] > .section-paragraph').length > 0,
  3389: (document) =>
    document.querySelector('.JournalAbstract .authors+.notes+p') &&
    document.querySelectorAll('.article-container .JournalFullText > p').length > 0,
  1186: (document) =>
    document.querySelector('[data-title="Abstract"] .c-article-section__content') &&
    document.querySelectorAll('main > article > section:not([data-title="Abstract"]):not(#MagazineFulltextArticleBodySuffix ~ section) .c-article-section__content > p').length > 0,
  3762: (document) =>
    document.querySelector('#articleContent #abstract p') &&
    document.querySelectorAll('#articleContent .text-bs > p').length > 0,
  1371: (document) =>
    document.querySelector('.article-content .abstract-content p') &&
    document.querySelectorAll('.article-content #artText div[id^="section"] > p').length > 0,
};
// Keys that reuse another key's validator.
validators["1006"] = validators["1016"];
validators["1149"] = validators["1088"];
/**
 * Per-validator DOM repairs applied right before a page is saved.
 * Both fixers copy `data-src` into `src` for figure images whose `data-src`
 * starts with "http".
 */
const documentFixer = {
  1088: (document) => {
    const lazyImages = document.querySelectorAll(
      'main figure img[data-src^="http"]'
    );
    for (const img of Array.from(lazyImages)) {
      img.src = img.dataset.src;
    }
  },
  3389: (document) => {
    const lazyImages = document.querySelectorAll(
      '.article-container .JournalFullText .FigureDesc img[data-src^="http"]'
    );
    for (const img of Array.from(lazyImages)) {
      img.src = img.dataset.src;
    }
  },
};
// Key 1149 shares the fixer of key 1088.
documentFixer["1149"] = documentFixer["1088"];
/**
 * Appends a <script> element pointing at `url` to the end of document.body.
 * @param {string} url - Value assigned to the script element's `src`.
 */
const addScript = (url) => {
  const scriptEl = Object.assign(document.createElement("script"), {
    src: url,
  });
  document.body.append(scriptEl);
};
/**
 * Builds a random client identifier: a random number in [0, 1e6) rendered in
 * base 32 with the radix point removed.
 * @returns {string}
 */
const generateClientId = () => {
  const base32 = (Math.random() * 1e6).toString(32);
  return base32.split(".").join("");
};
// main function
(function () {
"use strict";
// Inject the SingleFile helper scripts (bootstrap, frame hooks, frames) in
// this order.
for (const helperUrl of [
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
  "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
]) {
  addScript(helperUrl);
}
// Override the page's fetch to bypass CORS: when a request made with two or
// more arguments fails, it is retried once through a proxy.
/** The "fetch-url2.deno.dev" proxy code is as follows:
 *
serve((req: Request) => handleRequest(req));
async function handleRequest(req: Request) {
const url = req.url;
const finalUrl = url && url.split("?url=")[1];
if (!finalUrl) {
return new Response(url + " no match '?url='");
}
const res = await fetch(finalUrl);
return new Response(res.body, {
headers: {
...res.headers,
"Access-Control-Allow-Origin": "*",
"Access-Control-Expose-Headers":
"Request-Context,api-supported-versions,Content-Length,Date,Server",
},
});
}
**/
const CORS_PROXY_ORIGIN = "https://fetch-url2.deno.dev";
window.unsafeWindow.fetch = async (...args) => {
  // Single-argument calls are passed through without a proxy fallback.
  if (args.length <= 1) {
    return fetch(...args);
  }
  const [resource, ...otherArgs] = args;
  try {
    return await fetch(...args);
  } catch (err) {
    // `resource` may be a string, a URL or a Request; normalise it to a
    // string before inspecting it.
    const rawUrl =
      typeof resource === "string"
        ? resource.trim()
        : String(resource?.url ?? resource);
    // The proxy itself failed: surface the original error instead of
    // resolving with `undefined`.
    if (rawUrl.startsWith(CORS_PROXY_ORIGIN)) {
      throw err;
    }
    // Resolve relative and protocol-relative URLs against the current page.
    const absoluteUrl = new URL(rawUrl, location.href).href;
    // The proxy splits on "?url=", so the target is appended unencoded.
    return fetch(`${CORS_PROXY_ORIGIN}?url=${absoluteUrl}`, ...otherArgs);
  }
};
/**
 * Triggers a browser download of `data` under the name `fileName`.
 *
 * The data is wrapped in an "application/octet-stream" Blob and downloaded by
 * clicking a temporary hidden anchor. The anchor is removed again afterwards
 * so that it neither accumulates in the page nor ends up in HTML that is
 * serialised from the DOM later on.
 *
 * @param {BlobPart} data - Content of the file.
 * @param {string} fileName - Suggested file name for the download.
 */
const downloadFile = (data, fileName) => {
  const blob = new Blob([data], {
    type: "application/octet-stream",
  });
  const url = window.URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.style.display = "none";
  a.href = url;
  a.download = fileName;
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Always clean up, even if click() throws.
    a.remove();
    window.URL.revokeObjectURL(url);
  }
};
/**
 * Returns a promise that resolves (with no value) after `duration` seconds.
 * @param {number} duration - Delay in seconds.
 * @returns {Promise<void>}
 */
const sleep = (duration) =>
  new Promise((resolve) => {
    const delayMs = duration * 1000;
    setTimeout(() => resolve(), delayMs);
  });
/**
 * Logs a styled warning, waits, then reloads the current page.
 * @param {number} [waiting=60] - Seconds to wait before reloading.
 * @param {string} [message=""] - Reason shown in the warning.
 */
async function reload(waiting = 60, message = "") {
  const notice = `%c${message}, reload ${waiting}s later`;
  console.warn(notice, printStyle);
  await sleep(waiting);
  location.reload();
}
/**
 * Opens the browser's file picker and resolves with the user's selection.
 *
 * @param {string} [accept=""] - Value for the input's `accept` attribute.
 * @param {boolean} [multiple=false] - Allow selecting several files.
 * @returns {Promise<File|FileList>} The first selected file, or the whole
 *   FileList when `multiple` is truthy. Rejects with an Error when the picker
 *   is dismissed without a selection.
 */
function readFile(accept = "", multiple = false) {
  const inputEl = document.createElement("input");
  inputEl.setAttribute("type", "file");
  inputEl.setAttribute("accept", accept);
  // `multiple` is a boolean attribute: setAttribute("multiple", false) would
  // still enable it, so the property is set instead.
  inputEl.multiple = Boolean(multiple);
  inputEl.style.display = "none";
  return new Promise((resolve, reject) => {
    let armTimer;
    // Releases everything this call registered; safe to call more than once.
    function cleanup() {
      clearTimeout(armTimer);
      window.removeEventListener("click", onWindowClick, true);
      inputEl.remove();
    }
    function cancel() {
      cleanup();
      reject(new Error("用户取消选择"));
    }
    // Fallback cancel detection: the first click on the page after the picker
    // was opened, while the input is still empty, counts as a cancellation.
    function onWindowClick() {
      if (!inputEl.value) {
        cancel();
        return;
      }
      window.removeEventListener("click", onWindowClick, true);
    }
    inputEl.addEventListener("change", () => {
      resolve(multiple ? inputEl.files : inputEl.files[0]);
      cleanup();
    });
    // Browsers that support it fire "cancel" when the picker is dismissed.
    inputEl.addEventListener("cancel", cancel);
    document.body.append(inputEl);
    inputEl.click();
    // Arm the click fallback slightly later so that the click which opened
    // the picker is not mistaken for a cancellation.
    armTimer = setTimeout(() => {
      window.addEventListener("click", onWindowClick, true);
    }, 100);
  });
}
/**
 * Adds a fixed-position "Import" button to the page.
 *
 * Clicking it asks for confirmation, lets the user pick a JSON file and, when
 * the file contains an array of objects that each have a `doi` and a
 * `validator`, stores that array under the "tasks" key and reloads the page.
 * Anything else is reported with an alert; a dismissed file picker is ignored.
 */
function AddImportBtn() {
  const btnWrapImport = document.createElement("div");
  btnWrapImport.id = "CRAWLER_ID";
  btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
  const importBtn = btnWrapImport.querySelector("button");
  importBtn.onclick = async () => {
    if (
      !window.confirm(
        "The data in browser will be clear up. Please make sure you have to do this !!!"
      )
    ) {
      return;
    }
    let file;
    try {
      file = await readFile(".json");
    } catch (error) {
      // The picker was dismissed: nothing to import.
      console.warn(error);
      return;
    }
    let json = null;
    try {
      json = JSON.parse(await file.text());
    } catch (error) {
      // Unreadable or malformed JSON falls through to the format alert below.
      console.error(error);
    }
    const isTaskList =
      Array.isArray(json) &&
      json.every((item) => item && item.doi && item.validator);
    if (!isTaskList) {
      alert(
        "Please upload json file like [{doi: string, validator: string, ...}]"
      );
      return;
    }
    // Wait until the tasks are persisted; reloading earlier could lose them.
    await GM.setValue("tasks", json);
    location.reload();
  };
  document.body.appendChild(btnWrapImport);
}
/**
 * Detaches the element with id "CRAWLER_ID" (the import button wrapper) from
 * its parent, if such an element exists.
 */
function removeImportBtn() {
  const wrapper = document.getElementById("CRAWLER_ID");
  if (!wrapper) {
    return;
  }
  wrapper.parentElement.removeChild(wrapper);
}
// Menu command "Download": exports the stored task list as a JSON file.
// The file name consists of the export date and time, the total number of
// tasks and the number of tasks that are no longer waiting.
GM_registerMenuCommand("Download", async () => {
  // Nothing is stored before the first import; fall back to an empty list
  // instead of crashing on `undefined.filter`.
  const taskData = (await GM.getValue("tasks")) || [];
  const waitingTasks = taskData.filter(
    (task) =>
      !task.downloaded &&
      task.validated === undefined &&
      validators[task.validator]
  );
  const now = new Date();
  downloadFile(
    JSON.stringify(taskData),
    `${now.getFullYear()}-${
      now.getMonth() + 1
    }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
      taskData.length
    }-${taskData.length - waitingTasks.length}.json`
  );
});
// CSS applied to the script's `%c`-prefixed console messages.
const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
/**
 * Runs one crawl step for the current page load.
 *
 * Steps, in order:
 *  1. Show the import button, wait, and inject the main SingleFile script.
 *  2. Load the stored tasks and pick those still waiting (not downloaded,
 *     never validated, and with a known validator).
 *  3. Report progress to the statistics endpoint (fire-and-forget).
 *  4. If nothing is waiting, reload later. Otherwise take the first waiting
 *     task; if the page text does not contain its DOI, navigate to the DOI.
 *  5. Validate the page. On success save it twice (SingleFile output and raw
 *     outer HTML) and mark the task as downloaded; otherwise mark it as not
 *     validated. Persist the tasks.
 *  6. Navigate to the next waiting task, or reload later when there is none.
 *
 * @returns {Promise<void>}
 */
async function start() {
  console.log(new Date());
  AddImportBtn();
  await sleep(7);
  addScript(
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
  );
  const tasks = (await GM.getValue("tasks")) || [];
  // Find tasks which were not downloaded and not validated before.
  const waitingTasks = tasks.filter(
    (task) =>
      !task.downloaded &&
      task.validated === undefined &&
      validators[task.validator]
  );
  console.log(
    `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
    printStyle,
    tasks
  );

  // ---------------------------- Report progress ----------------------------
  let clientId = await GM.getValue("clientId");
  if (typeof clientId !== "string" || !clientId) {
    clientId = generateClientId();
    await GM.setValue("clientId", clientId);
  }
  const invalidatedTasks = tasks.filter((task) => task.validated === false);
  // Most recently finished first; tasks without a timestamp sort last.
  const doneTasks = tasks
    .filter((task) => task.downloaded)
    .sort((a, b) => (b.updateTime ?? 0) - (a.updateTime ?? 0));
  const previousDay = Date.now() - 24 * 3600 * 1000;
  const last24hDoneTasks = doneTasks.filter(
    (task) => task.updateTime > previousDay
  );
  const lastDoneStamp = doneTasks[0]?.updateTime;
  // Avoid reporting "Invalid Date" when nothing has been downloaded yet.
  const lastDoneText = lastDoneStamp
    ? new Date(lastDoneStamp).toLocaleString()
    : "N/A";
  const reportTip = `Last download time: ${lastDoneText}\nSpeed: ${last24hDoneTasks.length} / last 24h`;
  // Not awaited on purpose: reporting must not delay or break the crawl.
  GM.xmlHttpRequest({
    url: "https://crawler-hit.deno.dev/api/update",
    method: "POST",
    headers: { "Content-Type": "application/json" },
    data: JSON.stringify({
      account: clientId,
      invalidate_count: invalidatedTasks.length,
      done_count: doneTasks.length,
      queue_count: waitingTasks.length,
      tip: reportTip,
    }),
  })
    .then((res) => {
      console.log({ res });
    })
    .catch((error) => {
      console.warn("Progress report failed", error);
    });

  if (!waitingTasks.length) {
    await reload(90, "No tasks waiting");
    return;
  }

  // -------------------- Detect Cloudflare challenge ------------------------
  await sleep(10);
  const currentTask = waitingTasks[0];
  const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
  const validator = validators[currentTask.validator];
  if (document.getElementById("challenge-form")) {
    console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
    await sleep(20);
    currentTask.validated = false;
    currentTask.cloudflareBlock = true;
  }

  // --------------------------- Page validate -------------------------------
  // NOTE(review): a target page that never contains the DOI text would be
  // redirected to again on every load — confirm this cannot happen.
  if (
    !currentTask.cloudflareBlock &&
    !document.body.textContent.toLowerCase().includes(doi)
  ) {
    console.log(
      `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
      printStyle
    );
    await sleep(5);
    location.href = currentTask.doi;
    return;
  }

  // A validator that throws is treated as a failed validation so that the
  // task is marked and the crawl moves on instead of aborting the run.
  let isValid = false;
  if (!currentTask.cloudflareBlock) {
    try {
      isValid = Boolean(validator(document));
    } catch (error) {
      console.error(error);
    }
  }

  if (isValid) {
    console.log(
      "%cValidate successfully! Downloading page...",
      printStyle,
      waitingTasks,
      tasks
    );
    removeImportBtn();
    // Repair special pages before saving.
    if (typeof documentFixer[currentTask.validator] === "function") {
      documentFixer[currentTask.validator](document);
    }
    try {
      const data = await singlefile.getPageData(DEFAULT_CONFIG);
      const baseName = doi.replaceAll("/", "_");
      downloadFile(data.content, `${baseName}.singlefile.html`);
      downloadFile(document.body.parentElement.outerHTML, `${baseName}.html`);
      currentTask.downloaded = true;
      currentTask.validated = true;
      currentTask.updateTime = Date.now();
    } catch (error) {
      console.error(error);
      await reload(10, `singlefile error! ${currentTask.doi}`);
      return;
    }
  } else {
    console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
    currentTask.validated = false;
  }
  await GM.setValue("tasks", tasks);

  // --------------------------- Prepare next task ---------------------------
  const nextTask = waitingTasks[1];
  if (nextTask) {
    console.log(
      `%cStart next task 10s later...`,
      printStyle,
      nextTask.doi,
      tasks
    );
    await sleep(10);
    location.href = nextTask.doi;
  } else {
    await reload(60, "No tasks waiting");
  }
}
// Kick off the crawl; log failures instead of leaving an unhandled rejection.
start().catch((error) => {
  console.error(error);
});
})();