Eza's Universal Scraper

Gather all images from any page, on command

  1. // ==UserScript==
  2. // @name Eza's Universal Scraper
  3. // @namespace https://inkbunny.net/ezalias
  4. // @description Gather all images from any page, on command
  5. // @license MIT
  6. // @license Public domain / no rights reserved
  7. // @include *
  8. // @version 1.8
  9. // @noframes
  10. // @grant GM_registerMenuCommand
  11. // ==/UserScript==
  12.  
  13. // Grab all images and image links - create simple html list of links - insert list at top of page.
  14.  
  15. // My apologies to anyone reading this; it is a hot mess. But published beats perfect.
  16.  
  17. // Aha: there IS multi-splitting, using regexes as the delimiter. E.g. "Hello awesome, world!".split(/[\s,]+/); for splitting on spaces and commas.
  18. // Stop scraping. Use document object model.
  19.  
  20. // Try to find videos, too, like http://www.pornhub.com/view_video.php?viewkey=1098897412 - the mp4's in the source, but not linked except for subscribers
  21.  
  22. // Imgur does some stupid crap with fake loading / unloading. That'd be fine for memory's sake - but the links also disappear. DownThemAll breaks and this only lists presently-onscreen images.
  23. // Images are still under "content" inside <meta property="og:image"> tags.
  24. // Tempted to just split(http) and filter for URLs with image-filetype extensions.
  25. // List Pixiv links because 'select -> open all in tabs' doesn't work anymore. (Eventually fix in Smoothener.)
  26. // Consider setting custom colors, at least for visited links. (Done.)
  27. // Added NewGrounds because their gallery view is sort of terrible.
  28. // For text files: document.body.innerHTML = document.body.innerHTML.split( /[\n ]/ ).filter( s => s.match( '//' ) ).map( s => '<a href=' + s + '>' + s + '</a><br>' )
  29. // Support MP4s.
  30. // I'm considering getting rid of the link whitelist. Linking all image still goes first, and has purpose - but why not support all ordered links in gallery-like sites?
  31. // This thought leads back to old considerations of scraping all my various subscription pages on multiple websites. With text links, it's tractable.
  32. // The main obstacle, then as now, is domain origin policy. It -should- help that a script for this could run on each of those domains.
  33. // Hide visited links? display:none does not work. Neither does visibility:hidden. Uh... huh.
  34. // https://stackoverflow.com/questions/20074015/a-visited-img-display-none - fuck's sake. "For privacy reasons," it's broken on purpose.
  35. // So there's probably no workaround like unlinking those links. Anything that could tell the site which other sites you've visited is a vulnerability. Shoot.
  36. // 'Open unvisited links' would serve the same purpose.
  37. // booru.org
  38. // Highlight MP4s. Give them a special class or some shit.
  39. // This occasionally gets 'script running too long' errors... and on weird sites. Like DuckDuckGo. Which is bizarre, since I don't think it runs anything at all until triggered. Even then there's no live HTML collection left spinning away. They get Array.from'd in a hurry.
  40. // Menuless access a la Gallery Swallower: insert a DOM element in the upper-left corner, thin enough it won't interfere with sensible website design. When clicked it changes classes. Put an interval in this script to check the length of a live HTML collection containing that class. When it's nonzero, clear the interval and show_links().
  41. // Ideally this is a button that slides out or has on-hover text indicating what the hell it does. It'll be useless on a touchscreen, but come on.
  42. // I commented on HF and "Link all images" appeared in my comment text. The button appeared in the comment iframe and I shrugged it off. This is cause to add //@noframes, obviously, but more importantly it makes me wonder what the fuck HF's comment code is doing and whether that's a vulnerability.
  43. // Given the appearance, consider releasing as "Eza's Scraping Notch." But then Universal Scraper is more descriptive.
  44.  
  45. /*
  46. . direct image links from 'eza's universal scraper' are so great on sites like Mastodon. no lag. no endless loading. I want that to work on pixiv, which is all lag and loading, because they use lazy js for fucking everything.
  47. https://i.pximg.net/c/250x250_80_a2/img-master/img/2020/05/17/13/37/33/81630978_p0_square1200.jpg
  48. https://i.pximg.net/img-original/img/2020/05/17/13/37/33/81630978_p0.png
  49. . so I'd have to link manga pages normally, or detect page count... but single images are trivial.
  50. */
  51.  
  52. // Linux Mint addition: oh hey, e621 has trivial thumbnails.
  53. // https://static1.e621.net/data/preview/dc/07/dc07f1901445c35ab83c052493ec39d2.jpg
  54. // https://static1.e621.net/data/dc/07/dc07f1901445c35ab83c052493ec39d2.png
  55. // Fuck, file extension!
  56.  
  57. // https://art.ngfiles.com/medium_views/1464000/1464020_bluebreed_veronica.png?f1602728027
  58. // https://art.ngfiles.com/images/1464000/1464020_bluebreed_veronica.png?f1602727984
  59.  
  60. // Change numbering: build forward list as numbered links / HTML strings, then reverse that list, so reversed numbers count back down.
  61.  
  62. GM_registerMenuCommand( "Link all images at top of page", show_links );
  63.  
  64. // Put button on page, since menu is missing on later userscript plugins
  65. var trigger = document.createElement( 'button' );
  66. // Onclick, change class to some spinner, so it reacts instantly and looks like it's loading. Really the interval is waiting a second.
  67. // html += '<style> .reloader { background-color:#dbd7d8; border-radius: 50%; width: 60px; height: 60px; text-align: center; display: inline-block; border:1px solid #19ab19; cursor:pointer; line-height: 20px; color:#194d19; font-family:Arial; font-size:33px; padding: 10px 10px; text-decoration:none; } .reloader:hover { background-color:#2abd2a; } </style>';
  68. trigger.style = "position: absolute; width: 90px; height: 30px; left:-85px; top: 5px; background-color:#303020; text-align: center; display: inline-block; border:1px solid #8080A0; cursor:pointer; line-height: 20px; color:#8080A0; font-family:Arial; font-size:10px; text-decoration:none; z-index: 1000000; overflow:hidden;" // Getting it to slide onscreen on-hover might require adding a proper 'style' element. :hover is a pseudoselector.
  69. // Aaargh the button is affected by other CSS. Do I have to specify a bunch of useless parameters so they're ignored?
  70. // Might be easier to not have text.
  71. // Can I align it from the right-hand side? I want it mostly offscreen, I don't care what it looks like.
  72. // Durrr set width and height.
  73. trigger.innerText = "Link all images"; // "Link all images" flows onto two lines when using right:99vw. Bleh.
  74. trigger.title = "Link all images at top of page";
  75. // Text is WIP. Ideally work the word "visible" in there, since it's non-obvious.
  76. trigger.className = "ezas_unclicked_button";
  77. //trigger.onclick = function(){ this.innerText='...'; this.className = 'ezas_clicked_button'; } // Immediate visible change, idempotent
  78. trigger.onclick = function(){ this.style = 'display:none;'; this.className = 'ezas_clicked_button'; } // Immediate visible change, idempotent
  79. document.body.appendChild( trigger );
  80.  
  81. // Injecting code into the page is nontrivial - ironically because function.toString is fragile - so just look for a change in the page.
  82. var button_check = document.getElementsByClassName( 'ezas_clicked_button' );
  83. var fake_event = setInterval( function() {
  84. if( button_check.length > 0 ) {
  85. clearInterval( fake_event );
  86. show_links();
  87. }
  88. }, 500 ); // 500 ms is an important threshold for action, and the tiny button instantly disappearing isn't cutting it. I wrote this and I'm tapping my foot.
  89.  
  90.  
  91. function show_links () {
  92. var links = get_links();
  93. var block = new String;
  94. // block += "<style> a { color: #BBA; } a:visited { color: #A1A; } </style>"; // This doesn't work, by the way.
  95. // block += "<style> .scraped a { color: #1BA; filter: drop-shadow( 0 0 3px #111 ); } .scraped a:visited { color: #A1A; } </style>";
  96. block += "<style> .scraped a { color: #1BA; } .scraped a:visited { color: #A1A; } </style>";
  97. // block += "<style> .scraped .universal a { color: #11A; } .scraped .universal a:visited { color: #A1A; } </style>";
  98. // <span style=''>? Or at least span id and then id.a in a <style> thing.
  99. // block += "<style> a { color: #BBBBAA; } a:visited { color: #AA11AA; visibility:hidden; display:none; } </style>"; // What the fuck.
  100. // ##article:has-text(/Promoted/)
  101. block += "<style> a:has-text( /.mp4/ ) { color: #BFA; } a:has-text( /.mp4/ ):visited { color: #F1A; } </style>"; // Nope. Do /original as well once this works.
  102. links.reverse(); // Reverse order. Should probably be an option instead of hardcoded, but this is already a hacky little thing.
  103. block += "<span class='scraped'> "
  104. for( var n = 0; n < links.length; n++ ) {
  105. if( n!= 0 && n % 10 == 0 ) { block += "<br>"; }
  106. block += "" + n + " <a class='universal' style:'display: none' href='" + links[n] + "'>" + links[n] + "</a> <br>\n";
  107. // console.log( links[n] );
  108. }
  109. block += "</span>";
  110. document.body.innerHTML = block + document.body.innerHTML;
  111. }
  112.  
  113. function get_links() {
  114. var urls = new Array;
  115. /*
  116. // Grab links
  117. var links = document.getElementsByTagName( 'a' );
  118. for( var which in links ) { urls.push( "" + links[which] ); }
  119.  
  120. // Grab <meta content="url"> because Imgur
  121. var links = document.getElementsByTagName( 'meta' );
  122. for( var which in links ) { urls.push( "" + links[which].content ); }
  123. */
  124. // Bare image links first
  125. // urls = urls.concat( Array.from( document.getElementsByTagName( 'a' ) ).map( v => v.href )
  126. // .filter( u => u.match('.jpg') || u.match('.png') || u.match('.gif') ) );
  127.  
  128. urls = urls.concat( Array.from( document.getElementsByTagName( 'a' ) ).map( v => v.href ) );
  129. urls = urls.concat( Array.from( document.getElementsByTagName( 'meta' ) ).map( v => v.content ) ); // Imgur. <meta content="url"> nonsense.
  130. /*
  131. // Filter URL list to exclude non-images
  132. for( var n = urls.length-1; n > 0; n-- ) { // Backwards
  133. var ditch = true;
  134. if( urls[n].indexOf( ".jpg" ) > 0 ) { ditch = false; }
  135. if( urls[n].indexOf( ".jpeg" ) > 0 ) { ditch = false; }
  136. if( urls[n].indexOf( ".png" ) > 0 ) { ditch = false; }
  137. if( urls[n].indexOf( ".gif" ) > 0 ) { ditch = false; }
  138. if( urls[n].indexOf( ".mp3" ) > 0 ) { ditch = false; }
  139. // if( urls[n].indexOf( "/pictures/" ) > 0 ) { ditch = false; } // Hacky HF deal - comment out later
  140. // if( urls[n].indexOf( "?mode=medium" ) > 0 ) { ditch = false; } // Hacky Pixiv deal
  141. if( urls[n].indexOf( "en/artworks/" ) > 0 ) { ditch = false; } // Hacky new Pixiv deal
  142. if( urls[n].match( '/art/view/' ) ) { ditch = false; } // Hacky NewGrounds deal
  143. if( ditch ) { urls.splice( n, 1 ); }
  144. }
  145. */
  146. var whitelist = [ ".jpg", ".jpeg", ".png", ".gif", ".mp3", ".mp4",
  147. "en/artworks/", //Pixiv
  148. "/s/", // IB
  149. "/view/", // FA - obviating NewGrounds, actually - and P34
  150. // "/", // General purpose? Ech, breaks Mastodon. Maybe sort.
  151. "s=view", // Gelbooru
  152. "/pictures/", // HF
  153. "/post/show/", // e296
  154. "/posts/", // Also e296, for pools?
  155. "/art/", // DeviantArt... bluuuh have to exclude #comments
  156. "/artworks/", // Pixiv
  157. "/picture.php", // HA
  158. "artstation.com/projects/", // ArtStation
  159. "/art/view/" ]; // NewGrounds
  160. urls = urls.filter( u => {
  161. return whitelist.map( w => u.match( w ) ? 1 : 0 ).reduce( (a,e) => a+e ); // If any whitelisted item matches this URL, keep this URL.
  162. } )
  163.  
  164. /*
  165. var blacklist = [ "#comments" ];
  166. urls = urls.filter( u => {
  167. // return blacklist.map( w => u.match( w ) ? 0 : 1 ).reduce( (a,e) => a+e );
  168. return u.match( "#comments" ) ? false : true; // Sloppy.
  169. } )
  170. */
  171. urls = urls.filter( u => ! u.match( "#comments" ) ); // Yeah?
  172.  
  173. // Direct image links from thumbnail links.
  174. // let thumbs = urls.filter( u => u.match( 'p0_square' ) ); // Pixiv
  175. // urls = urls.concat( u.map( u => u.replace( 'c/250x250_80_a2/img-master/', 'img-original/' ).replace( '_square1200', '' ) ) );
  176. // let s = u.split('/')
  177. // } ) )
  178. // https://i.pximg.net/c/250x250_80_a2/img-master/img/2020/05/17/13/37/33/81630978_p0_square1200.jpg
  179. // https://i.pximg.net/img-original/img/2020/05/17/13/37/33/81630978_p0.png
  180. // Goddammit - filetypes.
  181. /*
  182. // Add embedded images, unfiltered... because they're images
  183. var srcs = document.getElementsByTagName( 'img' );
  184. // for( var which in srcs ) { urls.push( "" + srcs[which].src ); }
  185. for( var n = 0; n < srcs.length; n++ ) { urls.push( "" + srcs[n].src ); }
  186. */
  187. urls = urls.concat( Array.from( document.getElementsByTagName( 'img' ) ).map( v => v.src ) );
  188. urls = urls.concat( Array.from( document.getElementsByTagName( 'video' ) ).map( v => v.src ) );
  189. urls = urls.concat( Array.from( document.getElementsByTagName( 'source' ) ).map( v => v.src ) ); // Really, <video> tag? Really?
  190. // Aaaargh new Twitter fucking hides images as you scroll.
  191. if( document.domain != "baraag.net" ) { // Completely fucky order on Baraag. No idea why.
  192. urls = urls.concat( Array.from( document.getElementsByTagName( 'a' ) ).map( v => v.href )
  193. .filter( u => u.match('.jpg') || u.match('.png') || u.match('.gif') ) );
  194. }
  195.  
  196. // Promote NewGrounds previews - note, probably ignores secondary images
  197. // https://art.ngfiles.com/medium_views/1464000/1464020_bluebreed_veronica.png?f1602728027
  198. // https://art.ngfiles.com/images/1464000/1464020_bluebreed_veronica.png?f1602727984
  199. urls = urls.concat(
  200. urls.filter( u => u.match( 'art.ngfiles.com/medium_views' ) )
  201. .map( u => u.replace( 'medium_views', 'images' ) )
  202. );
  203. // Might be better to redirect, like Twitter and Tumblr URLs. That'd keep the iu_ inline / secondary images in-order.
  204. // Durrrrr just map and replace instead of adding new URLs.
  205. // Thumbnails too?
  206. // <img src="https://art.ngfiles.com/thumbnails/1464000/1464020.png?f1602728033" alt="Veronica">
  207. // https://www.newgrounds.com/art/view/bluebreed/halloween-tron-bonne
  208. // https://art.ngfiles.com/images/1481000/1481891_bluebreed_halloween-tron-bonne.png?f1603898447
  209. /*
  210. urls = urls.concat(
  211. urls.filter( u => u.match( 'art.ngfiles.com/thumbnails' ) )
  212. .map( u => {
  213. u.replace( 'thumbnails', 'images' );
  214. let name = // Nope, need alt-text for this.
  215. } )
  216. );
  217. */
  218.  
  219. // Remove small images?
  220. // urls = urls.filter( u => ! u.match( '/small' ) ); // Baraag
  221.  
  222. // Remove duplicates
  223. for( var n = urls.length-1; n > 1; n-- ) { // Backwards, now
  224. for( var x = n-1; x > 0; x-- ) { // For each array value before N
  225. if( urls[x] == urls[n] ) { urls.splice( x, 1 ); n--; }
  226. }
  227. }
  228.  
  229. /*
  230. var url_set = new Set( urls );
  231. urls = Array.from( url_set );
  232. */
  233.  
  234. urls.push( '----- ----- ----- ----- ----- ----- -----' ); // Visible seperator
  235. urls = urls.concat( Array.from( urls ).reverse() ); // concat is functional and reverse isn't. Fuck Javascript.
  236.  
  237. return urls;
  238. }
  239.  
  240. function get_links3() {
  241. //for (var attrname in obj2) { obj1[attrname] = obj2[attrname]; }
  242. var links = document.getElementsByTagName( 'a' );
  243. for( var which in links ) {
  244. var ditch = true;
  245. if( links[which].href.indexOf( ".jpg" ) > 0 ) { ditch = false; }
  246. if( ditch ) { delete link; }
  247. }
  248. return links;
  249. }
  250.  
  251. function get_links2() {
  252. var srcs = document.getElementsByTagName( 'img' );
  253. var links = new Array;
  254. links = document.getElementsByTagName( 'a' );
  255. for( var n = srcs.length-1; n > 0; n-- ) { // Backwards
  256. var link = links[n].href;
  257. var ditch = true;
  258. if( link.indexOf( ".jpg" ) > 0 ) { ditch = false; }
  259. if( link.indexOf( ".jpeg" ) > 0 ) { ditch = false; }
  260. if( link.indexOf( ".png" ) > 0 ) { ditch = false; }
  261. if( link.indexOf( ".gif" ) > 0 ) { ditch = false; }
  262. // if( ditch ) { links.splice( n, 1 ); }
  263. if( ditch ) { delete links[n]; }
  264. }
  265. //return srcs.concat( links );
  266. return srcs;
  267. }
  268.  
  269. function get_links1 () {
  270. var links = new Array;
  271. // Gather <a> addresses
  272. var hrefs = document.body.innerHTML.split( 'href=' );
  273. for( var n = 0; n < hrefs.length; n++ ) {
  274. var url = hrefs[n].split( /["'>]+/ )[1]; // Terminate on quotes (or brackets, or space)
  275. links.push( url );
  276. }
  277. // Gather <img> sources
  278. var srcs = document.body.innerHTML.split( 'src=' );
  279. for( var n = 0; n < srcs.length; n++ ) {
  280. // var url = srcs[n].split( /["']+/ )[1];
  281. var url = srcs[n].split( /["'>]+/ )[1];
  282. links.push( url );
  283. }
  284. // Remove non-images
  285. for( var n = links.length-1; n > 0; n-- ) { // Backwards, now
  286. var ditch = true;
  287. if( links[n].indexOf( ".jpg" ) > 0 ) { ditch = false; }
  288. if( links[n].indexOf( ".jpeg" ) > 0 ) { ditch = false; }
  289. if( links[n].indexOf( ".png" ) > 0 ) { ditch = false; }
  290. if( links[n].indexOf( ".gif" ) > 0 ) { ditch = false; }
  291. if( ditch ) { links.splice( n, 1 ); }
  292. }
  293. // Remove duplicates
  294. for( var n = links.length-1; n > 1; n-- ) { // Backwards, now
  295. for( var x = n-1; x > 0; x-- ) { // For each array value before N
  296. if( links[x] == links[n] ) { links.splice( x, 1 ); n--; }
  297. }
  298. }
  299. return links;
  300. }
  301.  

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址