信息抓取

文章、笔记信息抓取

  1. // ==UserScript==
  2. // @name 信息抓取
  3. // @namespace http://tampermonkey.net/
  4. // @version 0.0.4
  5. // @description 文章、笔记信息抓取
  6. // @author dabennn
  7. // @match https://*.xiaohongshu.com/explore/*
  8. // @match https://*.douyin.com/video/*
  9. // @match https://baijiahao.baidu.com/*
  10. // @match https://mp.weixin.qq.com/*
  11. // @match https://*.sohu.com/a/*
  12. // @match https://*.toutiao.com/article/*
  13. // @match https://*.toutiao.com/w/*
  14. // @icon https://www.google.com/s2/favicons?sz=64&domain=github.com
  15. // @license MIT
  16. // @grant GM_notification
  17. // @grant unsafeWindow
  18. // @run-at document-start
  19. // ==/UserScript==
  20.  
  21. (function () {
  22. "use strict";
  /**
   * Render a timestamp as a `YYYY-MM-DD HH:mm:ss` string in local time.
   *
   * @param {number|string|Date} timestamp - Any value accepted by the `Date` constructor.
   * @returns {string} The formatted date; month, day, hours, minutes and
   *   seconds are zero-padded to two digits, the year is printed as-is.
   */
  function formatTimestamp(timestamp) {
    const moment = new Date(timestamp);
    const pad2 = (value) => String(value).padStart(2, "0");
    const datePart = [
      moment.getFullYear(),
      pad2(moment.getMonth() + 1), // getMonth() is zero-based
      pad2(moment.getDate()),
    ].join("-");
    const timePart = [
      moment.getHours(),
      moment.getMinutes(),
      moment.getSeconds(),
    ]
      .map(pad2)
      .join(":");
    return `${datePart} ${timePart}`;
  }
  /**
   * Read the trimmed text of the first element matching a CSS selector.
   *
   * @param {string} domSelector - Selector passed to `document.querySelector`.
   * @returns {string} The element's trimmed `textContent`, or `""` when no
   *   element matches.
   */
  function getElementText(domSelector) {
    const match = document.querySelector(domSelector);
    if (!match) {
      return "";
    }
    return match.textContent.trim();
  }
  /**
   * Read the trimmed text of the first node matching an XPath expression,
   * evaluated against the whole document.
   *
   * @param {string} xpath - XPath expression to evaluate.
   * @returns {string} The node's trimmed `textContent`, or `""` when the
   *   expression selects nothing.
   */
  function getElementTextByXPath(xpath) {
    const lookup = document.evaluate(
      xpath,
      document,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );
    const node = lookup.singleNodeValue;
    if (!node) {
      return "";
    }
    return node.textContent.trim();
  }
  /**
   * Show a floating panel in the top-right corner of the page with the
   * scraped information, a close button and a copy-to-clipboard button.
   *
   * The panel content is built from DOM nodes instead of `innerHTML`, so text
   * scraped from the page is never parsed as markup. Event handlers are bound
   * directly to this panel's own buttons, so every panel stays closable and
   * copyable even when several panels are open at once (the button ids are
   * kept for compatibility and are therefore not unique in that case).
   *
   * @param {string} infoText - Lines to display, separated by the literal
   *   token `<br>`; each line is rendered as plain text.
   * @param {string} copyText - Text written to the clipboard when the copy
   *   button is clicked.
   * @returns {void}
   */
  function createInfoModal(infoText, copyText) {
    // Floating container for the information.
    const infoDiv = document.createElement("div");
    Object.assign(infoDiv.style, {
      position: "fixed",
      top: "0",
      right: "0",
      backgroundColor: "rgba(255, 255, 255, 0.8)",
      padding: "10px",
      border: "1px solid gray",
      zIndex: "9999",
    });

    // Close button: removes this panel only.
    const closeButton = document.createElement("span");
    closeButton.id = "__close_btn__";
    closeButton.textContent = "×";
    Object.assign(closeButton.style, {
      cursor: "pointer",
      float: "right",
      fontSize: "18px",
      color: "red",
    });
    closeButton.addEventListener("click", () => {
      infoDiv.remove();
    });

    // Copy button: writes copyText to the clipboard and reports the outcome.
    const copyButton = document.createElement("span");
    copyButton.id = "__copy__";
    copyButton.textContent = "复制";
    Object.assign(copyButton.style, {
      cursor: "pointer",
      float: "right",
      marginRight: "5px",
      fontSize: "18px",
      color: "blue",
    });
    copyButton.addEventListener("click", async () => {
      // try/catch also covers a synchronous failure such as
      // `navigator.clipboard` being unavailable.
      try {
        await navigator.clipboard.writeText(copyText);
        GM_notification("已复制到剪贴板");
      } catch (err) {
        GM_notification("复制失败");
        console.error("复制失败:", err);
      }
    });

    // Buttons first (close, then copy), followed by the text lines.
    infoDiv.appendChild(closeButton);
    infoDiv.appendChild(copyButton);

    // Render each `<br>`-separated line as a text node so scraped content
    // stays inert; only the separators become real <br> elements.
    String(infoText)
      .split("<br>")
      .forEach((line, index) => {
        if (index > 0) {
          infoDiv.appendChild(document.createElement("br"));
        }
        infoDiv.appendChild(document.createTextNode(line));
      });

    document.body.appendChild(infoDiv);
  }
  /**
   * Add the fixed-position "信息抓取" trigger button (id `__float_btn__`) to
   * the page body, together with a small "×" that removes the button again.
   *
   * @returns {void}
   */
  function createFloatButton() {
    const floatBtn = document.createElement("div");
    floatBtn.id = "__float_btn__";
    floatBtn.textContent = "信息抓取";
    Object.assign(floatBtn.style, {
      position: "fixed",
      top: "150px",
      right: "0",
      zIndex: "9999",
      backgroundColor: "rgba(255, 100, 100, 0.9)",
      color: "#fff",
      borderRadius: "4px",
      padding: "10px",
      cursor: "pointer",
    });

    // Dismiss control shown in the button's top-right corner.
    const dismiss = document.createElement("span");
    dismiss.textContent = "×";
    Object.assign(dismiss.style, {
      position: "absolute",
      top: "-3px",
      right: "2px",
      fontSize: "16px",
      lineHeight: "1",
    });
    dismiss.addEventListener("click", (event) => {
      // Keep the click from also reaching the button's own click handlers.
      event.stopPropagation();
      floatBtn.parentNode.removeChild(floatBtn);
    });

    floatBtn.appendChild(dismiss);
    document.body.appendChild(floatBtn);
  }
  /**
   * Join `label:value` lines into the `<br>`-separated text shown in the
   * info panel, substituting the placeholder "未获取到" for missing values.
   *
   * A value counts as missing when nothing follows the label's colon or when
   * it is exactly the string `undefined`. Text that merely contains the word
   * "undefined" (for example a scraped title) is left untouched.
   *
   * @param {string[]} texts - Lines of the form `label:value`.
   * @returns {string} The lines joined with `<br>`.
   */
  const formatTextInfo = (texts) =>
    texts
      .map((text) =>
        // Accept both the ASCII and the full-width colon after the label.
        String(text).replace(/^([^:：]*[:：])(?:undefined)?$/, "$1未获取到")
      )
      .join("<br>");
  /**
   * Join counter fragments with single spaces and turn every occurrence of
   * the text `undefined` into `0`.
   *
   * @param {string[]} texts - Fragments such as `点赞12`.
   * @returns {string} The space-separated fragments.
   */
  const formatNumInfo = (texts) => {
    const joined = texts.join(" ");
    return joined.split("undefined").join("0");
  };
  /**
   * Normalise a scraped counter for the copied row: values containing at
   * least one digit are kept unchanged, anything else becomes the number 0.
   *
   * @param {*} num - Scraped counter text.
   * @returns {*} `num` itself when it contains a digit, otherwise `0`.
   */
  const formatCopyNum = (num) => {
    if (/\d/.test(num)) {
      return num;
    }
    return 0;
  };
  /**
   * Convert a Chinese-style date for the copied row: every `年` and `月`
   * becomes `/` and the first `日` is dropped. Other text is unchanged.
   *
   * @param {string} time - Date text, e.g. `2024年1月5日 10:30`.
   * @returns {string} The converted text, e.g. `2024/1/5 10:30`.
   */
  const getCopyTime = (time) => {
    const slashed = time.replace(/[年月]/g, "/");
    return slashed.replace(/日/, "");
  };
  /**
   * Scrape the current page and build the two strings used by the info panel.
   *
   * The site is chosen by substring checks on `window.location.href`, tested
   * in this order: Xiaohongshu, Douyin, Sohu, Toutiao article, Toutiao
   * micro-post (`/w`), Baijiahao, WeChat official account.
   *
   * @returns {{textInfo: string, copyInfo: string}} `textInfo` holds the
   *   `<br>`-separated lines for display; `copyInfo` holds one tab-separated
   *   row (time, platform, author, title, URL, counters, word count or
   *   duration). Both are empty strings when no site matches.
   */
  const getInfoTexts = () => {
    const href = window.location.href;

    // Builds both output strings from the scraped fields. `counters` is a
    // list of [label, value] pairs; a label is shown as `<label>数` in the
    // display text and as the bare label in the copied row.
    const compose = (platform, title, author, time, sizeLabel, size, counters) => ({
      textInfo: formatTextInfo([
        `标题:${title}`,
        `作者:${author}`,
        `发布时间:${time}`,
        `${sizeLabel}:${size}`,
        ...counters.map(([label, value]) => `${label}数:${value}`),
      ]),
      copyInfo: [
        getCopyTime(time),
        platform,
        author,
        title,
        href,
        formatNumInfo(
          counters.map(([label, value]) => `${label}${formatCopyNum(value)}`)
        ),
        size,
      ].join("\t"),
    });

    if (href.includes("xiaohongshu.com/explore")) {
      const state = unsafeWindow.__INITIAL_STATE__;
      const title = getElementText(".note-content .title");
      const author = getElementText(".username");
      let time = "";
      try {
        // The time comes from the page's state object rather than the DOM;
        // any missing link in the chain leaves `time` empty.
        const noteStore = state.note;
        const detail = noteStore.noteDetailMap[noteStore.firstNoteId.value];
        time = formatTimestamp(detail.note.lastUpdateTime);
      } catch (e) {
        console.error(e);
      }
      const wordCount = getElementText(".note-content").length;
      const likeNum = getElementText(".interact-container .like-wrapper .count");
      const commentNum = getElementText(".interact-container .chat-wrapper .count");
      const collectNum = getElementText(
        ".interact-container .collect-wrapper .count"
      );
      return compose("小红书", title, author, time, "字数", wordCount, [
        ["点赞", likeNum],
        ["评论", commentNum],
        ["收藏", collectNum],
      ]);
    }

    if (href.includes("douyin.com/video")) {
      const title = getElementText("h1 span span + span span span span span span");
      const author = getElementTextByXPath(
        '//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[4]/div/div[1]/div[2]/a/div/span/span/span/span/span/span'
      );
      const time = getElementTextByXPath(
        '//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[2]/span/text()[2]'
      );
      const duration = getElementText(".time-duration");
      const likeNum = getElementTextByXPath(
        '//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[1]/span'
      );
      const commentNum = getElementTextByXPath(
        '//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[2]/span'
      );
      const collectNum = getElementTextByXPath(
        '//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[3]/span'
      );
      const shareNum = getElementTextByXPath(
        '//*[@id="douyin-right-container"]/div[2]/div/div/div[1]/div[3]/div/div[2]/div[1]/div[4]/span'
      );
      return compose("抖音视频", title, author, time, "时长", duration, [
        ["点赞", likeNum],
        ["评论", commentNum],
        ["收藏", collectNum],
        ["分享", shareNum],
      ]);
    }

    if (href.includes("sohu.com/a")) {
      const title = getElementText("h1");
      const author = getElementText("#user-info h4 a");
      const time = getElementText("#news-time");
      const wordCount = getElementText("mp-editor").length;
      const readNum = getElementText(".read-num em");
      const likeNum = getElementText(".like-c .count");
      const commentNum = getElementText(".comment-count");
      const collectNum = getElementText(".collection-c .count");
      const shareNum = getElementText(".share-c .count");
      return compose("搜狐", title, author, time, "字数", wordCount, [
        ["阅读", readNum],
        ["点赞", likeNum],
        ["评论", commentNum],
        ["收藏", collectNum],
        ["分享", shareNum],
      ]);
    }

    if (href.includes("toutiao.com/article")) {
      const title = getElementText("h1");
      const author = getElementText(".article-meta .name");
      const time = getElementText(".article-meta span");
      const wordCount = getElementText(".tt-article-content").length;
      const likeNum = getElementText(".detail-like span");
      const commentNum = getElementText(".detail-interaction-comment span");
      const collectNum = getElementText(".detail-interaction-collect span");
      return compose("今日头条", title, author, time, "字数", wordCount, [
        ["点赞", likeNum],
        ["评论", commentNum],
        ["收藏", collectNum],
      ]);
    }

    if (href.includes("toutiao.com/w")) {
      const title = getElementText("h1");
      const author = getElementText(".desc .name");
      const time = getElementText(".abstract .time");
      const wordCount = getElementText("article").length;
      const likeNum = getElementText(".detail-like span");
      const commentNum = getElementText(".detail-interaction-comment span");
      const collectNum = getElementText(".detail-interaction-collect span");
      return compose("今日头条", title, author, time, "字数", wordCount, [
        ["点赞", likeNum],
        ["评论", commentNum],
        ["收藏", collectNum],
      ]);
    }

    if (href.includes("baijiahao.baidu.com")) {
      const title = getElementText("#header div");
      const author = getElementText("#header [data-testid=author-name]");
      const time = getElementText("#header [data-testid=updatetime]");
      const wordCount = getElementText("[data-testid=article]").length;
      const likeNum = getElementText("[data-testid=like-btn] .interact-desc");
      const commentNum = getElementText("[data-testid=comment-btn] .interact-desc");
      const collectNum = getElementText("[data-testid=favor-btn] .interact-desc");
      const shareNum = getElementText("[data-testid=share-btn] .interact-desc");
      return compose("百度", title, author, time, "字数", wordCount, [
        ["点赞", likeNum],
        ["评论", commentNum],
        ["收藏", collectNum],
        ["分享", shareNum],
      ]);
    }

    if (href.includes("mp.weixin.qq.com")) {
      const title = getElementText("h1");
      const author = getElementText("#js_name");
      const time = getElementText("#publish_time");
      const wordCount = getElementText("#js_content").length;
      // No counters are scraped here: the display text omits them and the
      // copied row carries only the bare counter labels.
      return {
        textInfo: formatTextInfo([
          `标题:${title}`,
          `作者:${author}`,
          `发布时间:${time}`,
          `字数:${wordCount}`,
        ]),
        copyInfo: [
          getCopyTime(time),
          "公众号",
          author,
          title,
          href,
          formatNumInfo([`点赞`, `转发`, `喜欢`, `评论`]),
          wordCount,
        ].join("\t"),
      };
    }

    // Unsupported page: nothing to show or copy.
    return {
      textInfo: "",
      copyInfo: "",
    };
  };
  395.  
  // Once the window's load event fires, add the floating trigger button and
  // make each click on it scrape the page and open an info panel.
  window.addEventListener("load", () => {
    createFloatButton();
    const floatBtn = document.querySelector("#__float_btn__");
    floatBtn.addEventListener("click", () => {
      const info = getInfoTexts();
      createInfoModal(info.textInfo, info.copyInfo);
    });
  });
  403. })();

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址