// AO3: Auto-scrape accurate tag usage data from AO3's tag search!!
//
// Takes a search from tag search and exports those tags + their total usage on AO3 in a new tab!
//
// ==UserScript==
// @name         AO3: Auto-scrape accurate tag usage data from Ao3's tag search!!
// @description  Takes a search from tag search and exports those tags + their total usage on ao3 in a new tab!
// @version      1.0.2

// @author       owlwinter
// @namespace    N/A
// @license      MIT license

// @match        *://*.archiveofourown.org/tags/search?*
// @grant        none
// ==/UserScript==

(function() {
    'use strict';

    //This script takes a search from tag search and exports the results in a new tab
    //It is!!! Very slow!!! Because I am also scraping to get further information about each tag
    //   ie, how many fics use each tag
    //   (the number displayed via tag search is only the number of works that use that exact tag,
    //   not the number of fics that redirect to that tag)
    // When I say slow, I mean it!!
    // It took me about 4 days to scrape the 271 pages of No Fandom canonical freeforms
    // It takes that long so as to not get us rate limited by the ao3 server gods :)

    // The scrape is started from an explicit button rather than on page load:
    // running it can get the user rate limited, so it must be opt-in.
    // The button sits in its own <div> appended to the heading matched by "#main > h3".
    const button = document.createElement("button");
    const resultsHeading = document.querySelector("#main > h3");
    const buttonHolder = document.createElement("div");
    buttonHolder.appendChild(button);
    resultsHeading.appendChild(buttonHolder);

    // Called for each tag in the search results list: requests the tag's works page,
    // reads the total work count out of the page's first <h2> and writes one table row to `doc`.
    // `url` is the url of the tag's works page
    // `tagname` is the name of the tag
    // `doc` is the page we are writing our results to
    // Returns nothing - the row is written later, from the request's completion handler.
    // On failure (rate limiting, unexpected status, missing heading) the problem is logged
    // to the console and no row is written for that tag.
    const getRealCount = function getRealCount(url, tagname, doc) {
        // Tag names are free text and can contain characters such as & or <.
        // Escape them so they show up literally in the exported table instead of being parsed as markup.
        const escapeHtml = text => String(text)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");

        const xhr = new XMLHttpRequest();
        xhr.onreadystatechange = function xhr_onreadystatechange() {
            if (xhr.readyState !== xhr.DONE) {
                return
            }
            if (xhr.status === 429) {
                // ~ao3jail
                // A value returned from an event handler is discarded, so report the problem via the console
                console.error(`Rate limited while fetching ${url}. Sorry :(`)
                return
            }
            if (xhr.status !== 200) {
                console.error(`Unexpected error while fetching ${url}, check the console :(`)
                console.log(xhr)
                return
            }

            //The response can be missing (not parseable as a document) or lack the heading we need
            const page = xhr.responseXML ? xhr.responseXML.documentElement : null
            const heading = page ? page.querySelector("h2") : null
            if (heading === null) {
                console.error(`Could not find the works heading for ${tagname} (${url})`)
                return
            }

            //Grabs the title text including number of works, collapsing runs of whitespace
            const titletext = heading.innerText.replace(/\s+/g, ' ').trim()

            let workcount
            //If there's multiple pages of results
            if (page.querySelector(".next") !== null) {
                //Format: 1 - 20 of 39 Works in #ficwip 5k Challenge
                //The total is the fifth space-separated word
                workcount = titletext.split(" ")[4]
            } else {
                //Format: 1 Work in #AksaraAgustus2017
                //Gets just the first word
                workcount = titletext.replace(/ .*/, '')
            }

            //Writing the tagname & workcount to our new tab
            doc.write(`<tr><td>${escapeHtml(tagname)}</td> <td>${escapeHtml(workcount)}</td> </tr>`)

            console.log(`${tagname} ${workcount}`)
        }
        xhr.open("GET", url)
        //Ask the browser to parse the response so it is available as xhr.responseXML
        xhr.responseType = "document"
        xhr.send()
    }

    //Code to export search results to a table in a new tab
    //Adjusted from the original code here: https://github.com/vaaas/ao3_wrangling_scripts/blob/master/bookmarklets/ao3_tags_as_table.js
    // `url` is the url of the first search results page to scrape; following pages are found
    //   through each page's a[rel='next'] link
    // Returns a promise that resolves (with no value) once every page was processed,
    //   or once scraping stopped because a results page could not be loaded
    async function async_export(url) {
        //CSS of the page we are exporting our results to
        const export_css = "table { border-collapse: collapse; border: 1px solid black; width: 100%; } td, th { padding: 4px; } tr:nth-child(even) { background: #eee; } th { background: #eFe; }";

        //Pause (in milliseconds) after each tag request and after each results page
        //If you want to make this script faster, you can lower the number
        //But this number is sure to not get you rate limited
        const request_delay_ms = 10000;

        //Some helper functions
        //Fetches a page as text; rejects on network errors and on any non-200 status so that
        //an error page is never mistaken for an (empty) last page of results
        const get = page_url => new Promise((resolve, reject) => {
            const xhr = new XMLHttpRequest();
            xhr.onload = () => {
                if (xhr.status === 200) {
                    resolve(xhr.responseText);
                } else {
                    reject(new Error(`Loading ${page_url} failed with status ${xhr.status}`));
                }
            };
            xhr.onerror = () => reject(new Error(`Loading ${page_url} failed (network error)`));
            xhr.open("GET", page_url);
            xhr.send()
        });
        const get_next_page_link = doc => doc.querySelector("a[rel='next']");
        const sleep = time => new Promise(resolve => setTimeout(resolve, time));

        //window.open() returns null when the browser blocks the pop-up
        const win = window.open();
        if (win === null) {
            alert("Could not open a new tab for the results - please allow pop-ups for this site and try again!")
            return;
        }

        let page = 1;
        const parser = new DOMParser();
        const doc = win.document;
        doc.write("<meta charset='utf-8'>");
        doc.write("<style>" + export_css + "</style>");
        doc.write("<table><tr>");
        //This is where you can change the header row text!
        doc.write(["Tag", "Count"].map(x => `<th>${x}</th>`).join(""));
        doc.write("</tr>");

        let current_url = url;
        let failure = null;
        try {
            //For each page of the search results:
            while (current_url !== null) {
                doc.title = `${page} pages deep`;
                page++;
                const loaded_page = parser.parseFromString(await get(current_url), "text/html");
                const resulttags = loaded_page.querySelectorAll("li .tag");

                //For each tag on that search results page:
                for (const row of resulttags) {
                    //Goes to the works page of that tag so we can grab the accurate tag count!!
                    //You can change this to the tag edit page if you want to collect different data
                    //Such as the fandoms all those tags are in
                    //Just be sure to change what getRealCount() is grabbing if you are doing so
                    getRealCount(row.href + "/works", row.innerText, doc);
                    await sleep(request_delay_ms)
                }
                const next_page_link = get_next_page_link(loaded_page);
                current_url = next_page_link ? next_page_link.href : null;
                await sleep(request_delay_ms)
            }
        } catch (err) {
            //Keep whatever was scraped so far and tell the user why we stopped
            failure = err;
            console.error(err);
        }

        //Note: the document is deliberately not closed with doc.close() - a late getRealCount()
        //response calling doc.write() on a closed document would replace the whole table
        doc.write("</table>");
        if (failure !== null) {
            doc.title = "Stopped early";
            alert(`Scraping stopped early: ${failure.message}`)
            return;
        }
        doc.title = "Done!";
        alert("Done!")
    }

    // Click handler for the scrape button.
    // `e` is the click event; its default action is cancelled.
    // Starts the export for the current page when the page url contains "canonical%5D=T"
    // (i.e. a canonical tag search); otherwise only shows an alert explaining what to change.
    const export_results = function export_results(e) {
        e.preventDefault();

        const onCanonicalSearch = window.location.href.includes("canonical%5D=T");
        if (onCanonicalSearch) {
            async_export(location.href);
        } else {
            window.alert("Please only run this script on a canonical tag search - change the \"Wrangling status\" option to \"Canonical\" and re-search tags!");
        }
    }

    // Labels the scrape button, wires its click to export_results and applies its inline styling
    button.innerText = "Scrape data";
    Object.assign(button.style, {
        display: "inline",
        fontSize: "0.627rem",
        marginTop: "10px",
    });
    button.addEventListener("click", export_results);

    // Your code here...
})();
// Welcome to the Greasy Fork mirror