Crawler base on SingleFile

Download site in single file automatically

您需要先安装一个扩展,例如 篡改猴Greasemonkey暴力猴,之后才能安装此脚本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安装一个扩展,例如 篡改猴暴力猴,之后才能安装此脚本。

您需要先安装一个扩展,例如 篡改猴Userscripts ,之后才能安装此脚本。

您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。

您需要先安装用户脚本管理器扩展后才能安装此脚本。

(我已经安装了用户脚本管理器,让我安装!)

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展,比如 Stylus,才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

您需要先安装一款用户样式管理器扩展后才能安装此样式。

(我已经安装了用户样式管理器,让我安装!)

// ==UserScript==
// @name              Crawler base on SingleFile
// @author            Mark
// @description       Download site in single file automatically
// @license           MIT
// @version           0.0.21
// @match             https://*/*
// @run-at            document-idle
// @grant GM.setValue
// @grant GM.getValue
// @grant GM.xmlHttpRequest
// @grant GM_registerMenuCommand
// @grant unsafeWindow
// @require     https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
// @require     https://openuserjs.org/src/libs/sizzle/GM_config.js
// @connect *
// @noframes
// @namespace https://greasyfork.org/users/1106595
// ==/UserScript==
 
const REPORT_ADDRESS = "https://crawler-hit.deno.dev/api/update";
const PAGE_LOADING_TIME = 7;
const ERROR_RELOAD_TIME = 10;
const ERROR_RELOAD_LONG_TIME = 60;
const NEXT_TASK_WAITING_TIME = 10;
 
const NO_TASK_WAITING_TIME = 90;
const CF_CHALLENGE_WAITING_TIME = 20;
const QUICK_SLEEP_TIME = 5;
const DOMAIN_REG = /^(https?):\/\/([^\s\/?\.#]+\.?)+$/;
const TASK_MAX_RETRY_TIMES = 3;
const TIME_POINT_TYPES = {
  PREPARE_START: "prepareStart",
  TASK_LOADED: "taskLoaded",
  TASK_REPORTED: "taskReported",
  PRESIGN_INDEX: "presignIndex",
  PRESIGN_SINGLEFILE: "presignSinglefile",
  SINGLE_FILE_SUCCESS: "singleFileSuccess",
  INDEX_FILE_UPLOADED: "indexFileUploaded",
  SINGLE_FILE_UPLOADED: "singleFileUploaded",
  VALIDATE_FAILED: "validateFailed",
};
let gmc = new GM_config({
  id: "CrawlerConfig",
  title: "Crawler setting",
  fields: {
    Name: {
      label: "Name",
      type: "text",
    },
    Password: {
      label: "Password",
      type: "text",
    },
    taskInterval: {
      label: "Task Interval (s)",
      type: "int",
      default: NEXT_TASK_WAITING_TIME,
    },
    taskMaxRetryTimes: {
      label: "Task Max Retry Times",
      type: "int",
      default: TASK_MAX_RETRY_TIMES,
    },
    preferServer: {
      label: "Prefer preSign Server",
      type: "text",
    },
    reportServer: {
      label: "Report Server",
      type: "text",
      default: REPORT_ADDRESS,
    },
  },
  events: {
    init: function () {
      // runs after initialization completes
    },
    save: function () {
      // runs after values are saved
      console.log("save", this.get("Name"), this.get("Password"));
      this.close();
    },
  },
});
 
const crawlerUtil = {
  addScript: (url) => {
    const s = document.createElement("script");
    s.src = url;
    s.onerror = (evt) => {
      setTimeout(() => {
        addScript(url);
      }, 2000);
    };
    document.body.append(s);
  },
 
  addScriptByText: async (url, cache = false, retry = 0) => {
    const s = document.createElement("script");
    s.dataset.crawler = "true";
    const scriptCache = (await GM.getValue("scriptCache")) || {};
    if (cache && scriptCache[url]) {
      s.innerHTML = scriptCache[url];
      document.body.append(s);
      return true;
    }
    try {
      const res = await GM.xmlHttpRequest({
        url: url,
        method: "GET",
      });
 
      const text = res.responseText;
      if (cache) {
        scriptCache[url] = text;
        GM.setValue("scriptCache", scriptCache);
      }
      s.innerHTML = text;
      document.body.append(s);
      return true;
    } catch (error) {
      if (retry > 3) {
        return false;
      }
      await sleep(2);
      return await addScriptByText(url, retry + 1);
    }
  },
 
  getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
    const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
    const preSignSevers = configServer.concat([
      "https://electrolyte-brain-minio.deno.dev",
    ]);
    async function getPreSignUrlFromServer(serverIndex = 0) {
      try {
        return await (
          await GM_fetch(
            `${preSignSevers[serverIndex]}/api/presignedPutObject?doi=${doi}&file_name=${fileName}&account=${name}&pass=${pass}`
          )
        ).json();
      } catch (error) {
        if (!preSignSevers[serverIndex + 1]) {
          return { reload: true };
        }
        return await getPreSignUrlFromServer(serverIndex + 1);
      }
    }
 
    const preSignRes = await getPreSignUrlFromServer();
    if (preSignRes.reload) {
      return "RELOAD";
    }
 
    const url = preSignRes?.url;
    return url || null;
  },
 
  uploader: async (url, content) => {
    const mime = "application/gzip"
    const gzip_data = pako.gzip(content, { level: 9 });
    const upload_blob = new Blob([gzip_data], { type: mime });
 
    return await GM.xmlHttpRequest({
      method: "PUT",
      url,
      headers: {
        "Content-Type": mime,
        "Content-Length": upload_blob.size,
      },
      data: upload_blob,
    });
  },
 
  downloadFile: (data, fileName) => {
    const a = document.createElement("a");
    document.body.appendChild(a);
    a.style = "display: none";
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    a.href = url;
    a.download = fileName;
    a.click();
    window.URL.revokeObjectURL(url);
  },
 
  generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),
 
  sleep: (duration) => {
    return new Promise((res, rej) => {
      setTimeout(() => res(), duration * 1000);
    });
  },
};
 
// main function
(function () {
  "use strict";
  const {
    addScript,
    addScriptByText,
    generateClientId,
    uploader,
    downloadFile,
    getPreSignUrl,
    sleep,
  } = crawlerUtil;
 
  const dependenciesInit = async () => {
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
      true
    );
 
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js"
    );
    await addScriptByText(
      "https://crawal-validator.deno.dev/"
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/npm/[email protected]/dist/pako.min.js"
    );
    return () => {
      document.querySelectorAll("script[data-crawler='true']").forEach((el) => {
        el.parentElement.removeChild(el);
      });
    };
  };
 
  const pureHTMLCleaner = (document) => {
    document.querySelectorAll("script").forEach((el) => {
      el.parentElement.removeChild(el);
    });
    document.querySelectorAll("style").forEach((el) => {
      el.parentElement.removeChild(el);
    });
  };
 
  // Overwrite fetch function to bypass CORS
  window.unsafeWindow.fetch = async (...args) => {
    return await fetch(...args).catch(async (err) => {
      return await GM_fetch(...args);
    });
  };
 
  async function reload(waiting = 60, message = "") {
    console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
    await sleep(waiting);
    location.reload();
  }
 
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    inputEl.setAttribute("multiple", !!multiple);
    return new Promise((resolve, reject) => {
      inputEl.addEventListener("change", (e) => {
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        window.removeEventListener("click", onWindowClick, true);
      });
      document.body.append(inputEl);
      inputEl.click();
 
      const onWindowClick = () => {
        if (!inputEl.value) {
          reject(new Error("用户取消选择"));
        }
        window.removeEventListener("click", onWindowClick, true);
      };
      setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }
 
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }
      const file = await readFile(".json");
      const reader = new FileReader();
 
      reader.onload = (event) => {
        const json = JSON.parse(event.target.result);
        // console.log({json}, 'json')
        // this.importFromBackUp.bind(this)(json);
        if (
          json instanceof Array &&
          json.every((item) => item.doi && item.validator)
        ) {
          GM.setValue("tasks", json);
          location.reload();
        } else {
          alert(
            "Please upload json file like [{doi: string, validator: string, ...}]"
          );
        }
      };
 
      reader.readAsText(file);
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      const importBtn = document.getElementById("CRAWLER_ID");
      if (importBtn) {
        importBtn.parentElement.removeChild(importBtn);
      }
    };
  }
 
  GM_registerMenuCommand("Download", async () => {
    const taskData = await GM.getValue("tasks");
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });
 
  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });
 
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
 
  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    if (nextDoi) {
      console.log(
        `%cStart next task ${taskInterval}s later...`,
        printStyle,
        nextDoi
      );
      await sleep(taskInterval);
      const taskData = await GM.getValue("tasks");
      const task = taskData.find((task) => task.doi === nextDoi);
      await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
      location.href = nextDoi;
    } else {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
    }
  };
 
  let lasestTimepoint = 0;
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[`timePoint_${pointName}`] = new Date().valueOf()
    }
    else {
      if (lasestTimepoint == 0) {
        lasestTimepoint = task[`timePoint_${TIME_POINT_TYPES.PREPARE_START}`] || 0;
      }
      if (lasestTimepoint == 0) {
        task[`timePoint_${pointName}`] = 0;
      } else {
        task[`timePoint_${pointName}`] = new Date().valueOf() - lasestTimepoint;
      }
      lasestTimepoint = new Date().valueOf();
    }
    await GM.setValue("tasks", taskData);
  };
 
  const checkRetry = async (task, taskData, nextDoi) => {
    const taskMaxRetryTimes = gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
    const retryTimes = task.retryTimes || 0;
    let result = true;
    if (retryTimes >= taskMaxRetryTimes) {
      console.log(`%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`, printStyle);
      task.validated = false;
      task.updateTime = new Date().valueOf();
      await prepareNextTask(nextDoi);
      result = false;
    } else {
      task.retryTimes = retryTimes + 1;
    }
    await GM.setValue("tasks", taskData);
    return result;
  }
 
  async function start() {
    console.log(new Date());
 
    const importBtnHandler = AddImportBtn();
 
    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }
 
    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();
 
    if (!singlefile || !singlefile.getPageData) {
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${currentTask.doi}`);
      return;
    }
 
    if (!(validators && DEFAULT_CONFIG)) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }
 
    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    let tasks = taskData || [];
 
    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );
 
    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }
 
    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
    const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );
 
    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);
 
    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = new Date().valueOf();
      await GM.setValue("tasks", tasks);
    };
 
    // ---------------------------- Report progress -----------------------------------------------------
 
    const reportUrl = gmc.get("reportServer") || REPORT_ADDRESS;
    const reportTip = `Last download time: ${lastDoneTime.toLocaleString()}
      Speed: ${last24hDoneTasks.length} / last 24h`;
    GM.xmlHttpRequest({
      url: reportUrl,
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        console.log("Report successfully", { res });
      })
      .finally(() => {
        saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks);
      });
 
 
    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // bypass els institution check
    if (document.querySelector('.sec-A #bdd-els-close')) {
      const elsCloseBtn = document.querySelector('.sec-A #bdd-els-close');
      elsCloseBtn.click();
    }
 
    // ---------------------------- validated task ------------------------------------------------
 
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    
    const validator = (document) => {
      const abs_selectors = validators[currentTask.validator]["sel_A"];
      const para_selectors = validators[currentTask.validator]["sel_P"];
      if (abs_selectors.length == 0 && para_selectors.length == 0) {
        return false;
      }
      const absValidated = abs_selectors.length == 0 || abs_selectors.some((selector) => document.querySelector(selector));
      const paraValidated = para_selectors.length == 0 || para_selectors.some((selector) => document.querySelectorAll(selector).length > 0);
      return absValidated && paraValidated;
    }
 
    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
      if (!name || !pass) {
        throw new Error();
      }
    } catch (err) {
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }
 
    const indexUrl = await getPreSignUrl(doiFixed, `_.html.gz`, name, pass, preferServer);
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    } else { 
      const old_index = await getPreSignUrl(doiFixed, `_.html`, name, pass, preferServer);
      const old_singlefileUrl = await getPreSignUrl(
        doiFixed,
        `_.sf.html`,
        name,
        pass,
        preferServer
      );
      if (!old_index && !old_singlefileUrl) {
        console.error("%cFile existed!!!", printStyle, currentTask.doi);
        await updateCurrentTask(false);
        await prepareNextTask(nextTask.doi);
        return;
      }
    } 
 
    // --------------------------- Page validate ------------------------------------------------------
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      if(await checkRetry(currentTask, tasks, nextTask.doi)){
        location.href = currentTask.doi;
      }
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      importBtnHandler();
      // repair special page
      if (typeof documentFixer[currentTask.validator] === "function") {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        // downloadFile(data.content, `${doiFixed}.singlefile.html`);
        // downloadFile(document.body.parentElement.outerHTML, `${doiFixed}.html`);
        if (singlefileUrl) {
          await uploader(singlefileUrl, data.content);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        if (indexUrl) {
          dependenciesHandler();
          pureHTMLCleaner(document);
          await uploader(indexUrl, document.body.parentElement.outerHTML);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        if (await checkRetry(currentTask, tasks, nextTask.doi)) {
          await reload(
            ERROR_RELOAD_TIME,
            `singlefile or upload error! ${currentTask.doi}`
          );
        }
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }
 
    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }
 
  start();
})();