知乎话题内容抓取

可以抓取知乎话题下面的所有回答和评论

  1. // ==UserScript==
  2. // @name 知乎话题内容抓取
  3. // @namespace rock
  4. // @version 1.0.3
  5. // @description 可以抓取知乎话题下面的所有回答和评论
  6. // @license MPL
  7. // @author rock
  8. // @match https://www.zhihu.com/question/**
  9. // @match https://www.zhihu.com/people/**
  10. // @icon 
  11. // @require https://code.jquery.com/jquery-2.1.4.min.js
  12. // @require https://cdn.bootcss.com/blueimp-md5/2.12.0/js/md5.min.js
  13. // @grant GM_xmlhttpRequest
  14. // ==/UserScript==
  15.  
  16. (function() {
  17. 'use strict';
  18.  
  19. // Your code here...
  20.  
  // Floating trigger button placed in the top-right corner of the page.
  const button = document.createElement("button");
  button.id = "id001";
  button.textContent = "开始抓取";
  Object.assign(button.style, {
    width: "96px",
    height: "32px",
    // `style.align` is not a CSS property and was silently ignored;
    // `textAlign` is the property that actually centers the label.
    textAlign: "center",
    backgroundColor: "#005ce6",
    color: "#fff",
    borderRadius: "3px",
    zIndex: "9999",
    position: "absolute",
    right: "20px",
    top: "10px",
  });

  // Start crawling when the button is clicked.
  button.onclick = function () {
    crawlData();
  };

  // Tip: open the browser dev tools (Ctrl+Shift+I) and use the Console tab
  // to experiment with the functions of this script.
  document.body.appendChild(button);
  44.  
  45.  
  /**
   * Offer `content` to the user as a file download named `fileName`.
   *
   * A temporary `<a download>` element pointing at a Blob URL is attached to
   * the page, clicked programmatically and removed again.
   *
   * @param {string} content - Text to place in the downloaded file.
   * @param {string} fileName - Suggested name for the downloaded file.
   */
  function saveShareContent(content, fileName) {
    // Turn the text into a Blob and obtain a temporary URL for it.
    const blobUrl = URL.createObjectURL(new Blob([content]));
    const downLink = document.createElement('a');
    downLink.download = fileName;
    downLink.href = blobUrl;

    // The link has to be in the document for the synthetic click to work.
    document.body.appendChild(downLink);
    try {
      downLink.click();
    } finally {
      // Always take the helper link out again, even if click() throws.
      document.body.removeChild(downLink);
      // Release the Blob after the browser has had time to start the
      // download; otherwise every export stays in memory until the tab closes.
      setTimeout(function () {
        URL.revokeObjectURL(blobUrl);
      }, 1000);
    }
  }
  58.  
  // Shared crawler state, read and updated by the functions below.
  const clientHeight = document.body.clientHeight; // step added to the scroll offset after each pass
  const textArra = []; // every answer record collected so far
  let t = 0; // vertical offset used for the next scroll
  let rickName = "知乎数据爬虫"; // overwritten with an author's nickname while crawling
  let h1 = "未知"; // placeholder until crawlData stores the title element here
  64.  
  /**
   * Collect the descendants of `oParent` whose `className` equals `sClass`.
   *
   * The whole class string is compared, so `sClass` must list the classes
   * exactly as they appear on the element (same order, same spacing).
   *
   * @param {Element} oParent - Element whose descendants are searched.
   * @param {string} sClass - Complete class string to look for.
   * @returns {Array} Matching elements in document order (possibly empty).
   */
  function getByClass(oParent, sClass) {
    // '*' selects every descendant element of the parent.
    const descendants = Array.from(oParent.getElementsByTagName('*'));
    return descendants.filter(function (candidate) {
      return candidate.className === sClass;
    });
  }
  75.  
  /**
   * Write the entries of a server response into an answer's comment editor
   * and press its submit button.
   *
   * Each entry of `response.data` becomes one "name:value" line. Nothing is
   * posted when the response carries no entries, or when the editor or the
   * submit button cannot be found inside `test` (for example because the
   * comment box has not been rendered yet).
   *
   * @param {{data: Array<{name: *, value: *}>}} response - Parsed server reply.
   * @param {Element} test - Answer element that contains the comment editor.
   */
  function comment(response, test) {
    const entries = response && Array.isArray(response.data) ? response.data : [];
    if (entries.length === 0) {
      // Nothing to say: do not submit an empty comment.
      return;
    }

    const commentStr = entries
      .map(function (entry) {
        return entry.name + ":" + entry.value + '\n';
      })
      .join('');

    const inputs = getByClass(test, 'public-DraftStyleDefault-block public-DraftStyleDefault-ltr');
    if (inputs.length === 0) {
      console.warn('comment: editor not found, skipping');
      return;
    }
    inputs[0].innerText = commentStr;

    // Looked up after the text is written, in the same order as before.
    const oks = getByClass(test, 'Button CommentEditorV2-singleButton Button--primary Button--blue');
    if (oks.length === 0) {
      console.warn('comment: submit button not found, skipping');
      return;
    }
    oks[0].click();
  }
  91.  
  /**
   * Extract the number from a comment-button label such as "128 条评论".
   *
   * @param {string} label - Visible text of the comment button.
   * @returns {number} The parsed count, or 0 when no number precedes "条评论".
   */
  function parseCommentCount(label) {
    const match = /([\d,]+)[\s\u200b]*条评论/.exec(label);
    if (!match) {
      return 0;
    }
    const count = Number.parseInt(match[1].replace(/,/g, ''), 10);
    return Number.isFinite(count) ? count : 0;
  }

  /**
   * Process every answer currently rendered on the page, then scroll down.
   *
   * For each `List-item` element this expands the answer ("阅读全文"), opens
   * its comment section, skips answers that were already collected, builds a
   * record, appends it to `textArra` and POSTs it to the collection server.
   * The server reply is handed to `comment()` for the same answer.
   *
   * NOTE(review): records are sent over plain HTTP to a hard-coded IP address
   * and the reply is posted as a comment under the user's account — confirm
   * that this is intended before shipping.
   *
   * @param {string} topic - Title stored with every record.
   */
  function buildData(topic) {
    const root = document.getElementsByTagName('html')[0];
    // Snapshot the live collection so that answers appended while requests are
    // in flight cannot change which element a callback refers to.
    const answers = Array.from(root.getElementsByClassName('List-item'));

    for (const answer of answers) {
      // Read before expanding: this collapsed text is the de-duplication key.
      const textstr = answer.innerText;
      const actionButtons = getByClass(answer, 'Button ContentItem-action Button--plain Button--withIcon Button--withLabel');
      const moreButtons = getByClass(answer, 'Button ContentItem-more Button--plain');

      // Expand the answer so that the full text is rendered.
      for (const more of moreButtons) {
        if (more.innerText.indexOf('阅读全文') !== -1) {
          more.click();
          break;
        }
      }

      // Open the comment section and remember how many comments it announces.
      let commentsNum = 0;
      for (const action of actionButtons) {
        const label = action.innerText;
        if (label.indexOf('条评论') !== -1) {
          // Store the count itself, not the position of the label text.
          commentsNum = parseCommentCount(label);
          action.click();
          break;
        }
      }

      if (is_exsit(textstr)) {
        console.log('重复');
        continue;
      }

      // Some answers lack a second user link or a vote button; fall back to
      // empty values instead of aborting the whole pass.
      const userLinks = getByClass(answer, 'UserLink-link');
      const nickLink = userLinks[1] || userLinks[0];
      rickName = nickLink ? nickLink.innerText : '';

      const voteButtons = getByClass(answer, 'Button VoteButton VoteButton--up');
      const praiseNum = voteButtons.length > 0 ? voteButtons[0].innerText : '';

      // Prefer the comments rendered inside this answer; only when it has none
      // fall back to every comment currently on the page.
      let nestComments = getByClass(answer, 'NestComment');
      if (nestComments.length === 0) {
        nestComments = getByClass(root, 'NestComment');
      }

      const text = {
        content: replaceAll(textstr),
        topic: topic,
        topicMd5: md5(topic),
        baseURI: answer.baseURI,
        nick: rickName,
        commentsNum: commentsNum,
        praiseNum: praiseNum,
        md5: md5(textstr),
        comments: getCommentList(nestComments),
      };
      textArra.push(text);

      GM_xmlhttpRequest({
        method: "POST",
        url: "http://116.205.177.46:8088/zhihu/save",
        headers: {
          "Content-Type": "application/json"
        },
        data: JSON.stringify(text),
        onload: function (response) {
          let parsed;
          try {
            parsed = JSON.parse(response.response);
          } catch (err) {
            // A non-JSON reply must not surface as an uncaught exception.
            console.log("请求失败");
            return;
          }
          comment(parsed, answer);
        },
        onerror: function () {
          console.log("请求失败");
        }
      });
    }

    // Scroll on so that the page loads the next batch of answers.
    window.scroll({ top: t, left: 0, behavior: 'smooth' });
    t += clientHeight;
  }
  162.  
  /**
   * Normalize scraped text: drop spaces and line breaks, then strip the
   * action-bar labels that follow each comment ("赞 回复 踩 举报").
   *
   * The labels are matched as one literal run. The previous pattern used
   * character classes, which removed every single 回/复/踩/举/报/赞 character
   * anywhere in the text and thereby corrupted the content itself.
   *
   * @param {string} str - Raw `innerText` of an answer or comment.
   * @returns {string} The cleaned text.
   */
  function replaceAll(str) {
    return str
      .replace(/[ \r\n]/g, "")
      // "赞" is optional: once a comment has likes the label is a number.
      .replace(/赞?回复踩举报/g, "");
  }
  168.  
  169.  
  /**
   * Turn a list of comment elements into a list of cleaned comment texts.
   *
   * @param {ArrayLike<{innerText: string}>} commentList - Comment elements.
   * @returns {string[]} `replaceAll(innerText)` for every element, in order.
   */
  function getCommentList(commentList) {
    return Array.from(commentList, function (node) {
      return replaceAll(node.innerText);
    });
  }
  177.  
  /**
   * Tell whether an answer with the same raw text was already collected.
   *
   * @param {string} str - Raw answer text.
   * @returns {boolean} True when a record in `textArra` has the same MD5.
   */
  function is_exsit(str) {
    // Hash once up front instead of once per stored record.
    const digest = md5(str);
    return textArra.some(function (item) {
      return item.md5 === digest;
    });
  }
  187.  
  /**
   * Ask how many answers to collect, then crawl the page until that many
   * records exist and download them as "<title>.json".
   *
   * Every two seconds `buildData` processes the answers rendered so far. The
   * crawl also stops, and saves what it has, when several consecutive passes
   * yield nothing new (the page has fewer answers than requested).
   */
  function crawlData() {
    const answer = prompt("请输入你要抓取的条数:", "100");
    if (answer === null) {
      // Dialog cancelled: do nothing rather than saving an empty file.
      return;
    }
    const repeat = Number.parseInt(answer, 10);
    if (!Number.isFinite(repeat) || repeat <= 0) {
      // A non-numeric target could never be reached and would loop forever.
      console.warn('crawlData: invalid item count: ' + answer);
      return;
    }

    const root = document.getElementsByTagName('html')[0];
    // Question pages expose the title, profile pages the user name.
    const titleElement = getByClass(root, 'QuestionHeader-title')[0] ||
      getByClass(root, 'ProfileHeader-name')[0];
    if (titleElement) {
      h1 = titleElement;
    }
    // When no title element exists, `h1` still holds its string placeholder.
    const topic = typeof h1 === 'string' ? h1 : h1.innerText;

    // Switch to the "view all answers" listing when that button is present.
    const allAnswer = getByClass(root, 'QuestionMainAction ViewAll-QuestionMainAction');
    if (allAnswer.length > 0) {
      allAnswer[0].click();
    }

    // Consecutive passes without new records after which the crawl gives up.
    const MAX_IDLE_ROUNDS = 15;
    let idleRounds = 0;

    const timer = setInterval(function () {
      if (repeat <= textArra.length) {
        finish();
        return;
      }
      const before = textArra.length;
      buildData(topic);
      idleRounds = textArra.length > before ? 0 : idleRounds + 1;
      if (idleRounds >= MAX_IDLE_ROUNDS) {
        finish();
      }
    }, 2000);

    // Stop the timer first so that a failing download cannot leave it running.
    function finish() {
      clearInterval(timer);
      saveShareContent(JSON.stringify(textArra), topic + ".json");
    }
  }
  212.  
  213.  
  214. })();

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址