// ==UserScript==
// @name 博客园新闻在线词云
// @name:en Cnblogs News Scraper and WordCloud Generator
// @description:en Cnblogs News Scraper,WordCloud Generator
// @name:ar مدونة الأخبار السحابية على الإنترنت
// @description:ar مكشطة أخبار مدونة، مولد السحابة الكلمية
// @name:bg Онлайн облак от думи за новини в Cnblogs
// @description:bg Скрепер за новини от Cnblogs, генератор на облак от думи
// @name:cs Online slovní oblak zpráv Cnblogs
// @description:cs Skreper zpráv Cnblogs, generátor slovního oblaku
// @name:da Cnblogs Nyheder Online Ordfold
// @description:da Cnblogs Nyhedsskraper, Ordfoldsgenerator
// @name:de Cnblogs Nachrichten Online-Wortwolke
// @description:de Cnblogs Nachrichtenscraper, Wortwolkengenerator
// @name:el Διαδικτυακό σύννεφο λέξεων ειδήσεων Cnblogs
// @description:el Ξύστης ειδήσεων Cnblogs, Γεννήτρια σύννεφου λέξεων
// @name:eo Interreta Vortonubo de Novaĵoj de Cnblogs
// @description:eo Skrapilo de Novaĵoj de Cnblogs, Generilo de Vortonubo
// @name:es Nube de palabras en línea de noticias de Cnblogs
// @description:es Raspador de noticias de Cnblogs, Generador de nube de palabras
// @name:fi Cnblogs-uutisten online-sanapilvi
// @description:fi Cnblogs-uutiskaavin, sanapilvigeneraattori
// @name:fr Nuage de mots en ligne des nouvelles de Cnblogs
// @description:fr Scraper de nouvelles Cnblogs, Générateur de nuage de mots
// @name:fr-CA Nuage de mots en ligne des nouvelles de Cnblogs
// @description:fr-CA Grattoir de nouvelles Cnblogs, Générateur de nuage de mots
// @name:he ענן מילים מקוון של חדשות Cnblogs
// @description:he מגרד חדשות Cnblogs, מחולל ענן מילים
// @name:hr Online oblak riječi vijesti Cnblogs
// @description:hr Skreper vijesti Cnblogs, Generator oblaka riječi
// @name:hu Cnblogs Hírek Online Szófelhő
// @description:hu Cnblogs Hírek Kaparó, Szófelhő Generátor
// @name:id Awan Kata Berita Cnblogs Online
// @description:id Pengikis Berita Cnblogs, Pembuat Awan Kata
// @name:it Nuvola di parole online delle notizie di Cnblogs
// @description:it Scraper di notizie Cnblogs, Generatore di nuvole di parole
// @name:ja Cnblogsニュースオンライン単語クラウド
// @description:ja Cnblogsニューススクレーパー、単語クラウドジェネレーター
// @name:ka Cnblogs სიახლეების ონლაინ სიტყვის ღრუბელი
// @description:ka Cnblogs სიახლეების სკრაპერი, სიტყვის ღრუბლის გენერატორი
// @name:ko Cnblogs 뉴스 온라인 워드클라우드
// @description:ko Cnblogs 뉴스 스크레이퍼, 워드클라우드 생성기
// @name:nb Cnblogs Nyheter Online Ordsky
// @description:nb Cnblogs Nyhetsskraper, Ordskygenerator
// @name:nl Cnblogs Nieuws Online Woordwolk
// @description:nl Cnblogs Nieuwsscraper, Woordwolkgenerator
// @name:pl Chmura słów online wiadomości Cnblogs
// @description:pl Skraper wiadomości Cnblogs, Generator chmury słów
// @name:pt-BR Nuvem de palavras online de notícias do Cnblogs
// @description:pt-BR Raspador de notícias do Cnblogs, Gerador de nuvem de palavras
// @name:ro Nor de cuvinte online al știrilor Cnblogs
// @description:ro Scraper de știri Cnblogs, Generator de nori de cuvinte
// @name:ru Онлайн-облако слов новостей Cnblogs
// @description:ru Скрапер новостей Cnblogs, Генератор облака слов
// @name:sk Online oblak slov správ Cnblogs
// @description:sk Skreper správ Cnblogs, Generátor oblaku slov
// @name:sr Онлајн облак речи вести Cnblogs
// @description:sr Скрејпер вести Cnblogs, Генератор облака речи
// @name:sv Cnblogs Nyheter Online Ordmoln
// @description:sv Cnblogs Nyhetsskrapare, Ordmolnsgenerator
// @name:th คลาวด์คำออนไลน์ของข่าว Cnblogs
// @description:th ตัวขูดข่าว Cnblogs, ตัวสร้างคลาวด์คำ
// @name:tr Cnblogs Haberler Çevrimiçi Kelime Bulutu
// @description:tr Cnblogs Haber Kazıyıcı, Kelime Bulutu Oluşturucu
// @name:ug Cnblogs خەۋەرلىرى تور سۆز بۇلۇتى
// @description:ug Cnblogs خەۋەر قىرگۇچى، سۆز بۇلۇتى ياسىغۇچى
// @name:uk Онлайн хмара слів новин Cnblogs
// @description:uk Скрапер новин Cnblogs, Генератор хмари слів
// @name:vi Đám mây từ trực tuyến của tin tức Cnblogs
// @description:vi Công cụ cạo tin tức Cnblogs, Trình tạo đám mây từ
// @name:zh 博客园新闻在线词云
// @description:zh 博客园新闻抓取工具,词云生成器
// @name:zh-CN 博客园新闻在线词云
// @description:zh-CN 博客园新闻抓取工具,词云生成器
// @name:zh-HK 博客園新聞線上詞雲
// @description:zh-HK 博客園新聞抓取工具,詞雲生成器
// @name:zh-SG 博客园新闻在线词云
// @description:zh-SG 博客园新闻抓取工具,词云生成器
// @name:zh-TW 博客園新聞線上詞雲
// @description:zh-TW 博客園新聞抓取工具,詞雲生成器
// @namespace http://tampermonkey.net/
// @version 1.2.1.1
// @description Scrape news from cnblogs and generate word clouds
// @author aspen138
// @icon https://assets.cnblogs.com/favicon.ico
// @match *://news.cnblogs.com/*
// @grant GM_xmlhttpRequest
// @require https://code.jquery.com/jquery-3.6.0.min.js
// @require https://cdnjs.cloudflare.com/ajax/libs/wordcloud2.js/1.1.2/wordcloud2.min.js
// @require https://cdn.jsdelivr.net/npm/[email protected]/dist/umd/segmentit.min.js
// @connect news.cnblogs.com
// @license MIT
// ==/UserScript==
// Acknowledgement: o1-preview
const isConsiderBody=false; // Whether the news body text is used (in addition to the title) when building word clouds
/**
 * Collect the news IDs linked from the `#news_list` container of the current page.
 *
 * Only anchors whose `href` starts with `/n/` and ends with `/` are inspected, and
 * only those containing a `/n/<digits>/` segment contribute an ID.
 *
 * @returns {string[]} The IDs as digit strings, in document order. An empty array
 *     is returned when the container is missing or holds no matching links, so
 *     callers always receive an array.
 */
function extract_newsIds() {
    const newsList = document.querySelector('#news_list');
    if (!newsList) {
        return [];
    }
    const newsLinks = newsList.querySelectorAll('a[href^="/n/"][href$="/"]');
    return Array.from(newsLinks)
        .map((link) => {
            const match = link.getAttribute('href').match(/\/n\/(\d+)\//);
            return match ? match[1] : null;
        })
        .filter(Boolean); // Drop links that carried no numeric ID
}
(function () {
'use strict';
/**
 * Inject a <script> element for `url` into the document head.
 *
 * @param {string} url - Address of the script to load.
 * @param {Function} callback - Installed as the element's `onload` handler.
 */
function loadScript(url, callback) {
    const element = Object.assign(document.createElement('script'), {
        src: url,
        type: 'text/javascript',
        onload: callback
    });
    document.head.appendChild(element);
}
// Fetch the segmentit bundle; once it has loaded, build a segmenter with the
// default configuration and hand it to main().
// NOTE(review): the "[email protected]" part of the URL looks like an e-mail
// obfuscation artifact of a "segmentit@<version>" specifier - confirm the URL resolves.
loadScript('https://cdn.jsdelivr.net/npm/[email protected]/dist/umd/segmentit.min.js', () => {
    const segmenter = new Segmentit.Segment();
    main(Segmentit.useDefault(segmenter));
});
function main(segmentit) {
// Default ID range: taken from the news links on the current page when there
// are any, otherwise from the hard-coded fallbacks below.
const newsIds = extract_newsIds();
const defaultMin = 781100; // Fallback start news ID
const defaultMax = 781159; // Fallback end news ID
const hasPageIds = Array.isArray(newsIds) && newsIds.length > 0;
const minNewsId = hasPageIds ? Math.min(...newsIds) : defaultMin;
const maxNewsId = hasPageIds ? Math.max(...newsIds) : defaultMax;
// Control panel markup: fixed to the top-right corner, laid out as a flex column.
var formHtml = `
<div id="news-scraper" style="position:fixed; top:50px; right:10px; background-color:#fff; padding:20px; border:1px solid #ccc; z-index:10000; display: flex; flex-direction: column; gap: 5px;">
<h3 style="margin: 0; text-align: center;">News Scraper and WordCloud Generator</h3>
<label style="display: flex; justify-content: space-between; align-items: center;">
Start News ID:
<input type="number" id="start-news-id" value="${minNewsId}" style="margin-left: 10px;" />
</label>
<label style="display: flex; justify-content: space-between; align-items: center;">
End News ID:
<input type="number" id="end-news-id" value="${maxNewsId}" style="margin-left: 10px;" />
</label>
<button id="start-scraping" style="align-self: center; padding: 5px 10px;">Start Scraping</button>
<div id="scraping-status" style="margin-top: 10px; text-align: center;"></div>
</div>
`;
$('body').append(formHtml);
$('#start-scraping').click(function () {
    var startId = Number.parseInt($('#start-news-id').val(), 10);
    var endId = Number.parseInt($('#end-news-id').val(), 10);
    // An empty or non-numeric field parses to NaN; reject it instead of
    // starting a scrape over an empty range.
    if (Number.isNaN(startId) || Number.isNaN(endId)) {
        alert('Start News ID and End News ID must both be numbers');
        return;
    }
    if (endId < startId) {
        alert('End News ID must be greater than or equal to Start News ID');
        return;
    }
    // startScraping is async: surface a failure instead of leaving the
    // rejection unhandled.
    startScraping(startId, endId).catch(function (err) {
        console.error('Scraping failed', err);
        $('#scraping-status').text('Scraping failed: ' + String(err));
    });
});
/**
 * Fetch every news page whose ID lies between `startId` and `endId`
 * (inclusive, walking from `startId` towards `endId`), report progress in
 * `#scraping-status`, and pass the collected records to processData().
 *
 * @param {number} startId - First news ID to fetch.
 * @param {number} endId - Last news ID to fetch.
 * @returns {Promise<void>} Settles after processData() has been called.
 */
async function startScraping(startId, endId) {
    // Step upwards when startId <= endId, downwards otherwise.
    const step = startId <= endId ? 1 : -1;
    const newsIds = [];
    for (let id = startId; step > 0 ? id <= endId : id >= endId; id += step) {
        newsIds.push(id);
    }

    const totalNews = newsIds.length;
    const newsData = [];
    let completedRequests = 0;
    $('#scraping-status').text('Starting scraping...');

    const concurrencyLimit = 1024; // Adjust this number as needed
    const queue = newsIds.slice(); // Workers consume this copy

    // Each worker keeps pulling the next ID until the shared queue is empty.
    const worker = async () => {
        while (queue.length > 0) {
            const newsId = queue.shift();
            const newsInfo = await fetchNews(newsId);
            if (newsInfo) {
                newsData.push(newsInfo);
            }
            completedRequests += 1;
            $('#scraping-status').text(`Scraped ${completedRequests} of ${totalNews}`);
        }
    };

    await Promise.all(Array.from({ length: concurrencyLimit }, () => worker()));
    // All done
    processData(newsData);
}
/**
 * Download one news page with GM_xmlhttpRequest and extract its details.
 *
 * The returned promise never rejects. It fulfils with the record built by
 * getNewsInfo() for an HTTP 200 response, and with `null` for any other
 * status, a network error, an aborted or timed-out request, or a failure
 * while parsing/extracting the page. Settling in every case matters because
 * the caller awaits this promise inside its worker loop.
 *
 * @param {number} newsId - ID of the news item; used to build the page URL.
 * @returns {Promise<Object|null>} The extracted record, or `null` on failure.
 */
function fetchNews(newsId) {
    return new Promise(function (resolve) {
        var url = 'https://news.cnblogs.com/n/' + newsId + '/';
        function giveUp() {
            resolve(null);
        }
        GM_xmlhttpRequest({
            method: 'GET',
            url: url,
            onload: function (response) {
                if (response.status !== 200) {
                    giveUp();
                    return;
                }
                try {
                    var parser = new DOMParser();
                    var doc = parser.parseFromString(response.responseText, 'text/html');
                    resolve(getNewsInfo(doc, newsId, url));
                } catch (err) {
                    // An exception thrown here would otherwise leave the promise pending.
                    console.warn('Failed to parse news page ' + url, err);
                    giveUp();
                }
            },
            onerror: giveUp,
            onabort: giveUp,
            ontimeout: giveUp
        });
    });
}
/**
 * Pull the title, publish-time text, view count and body text out of a parsed
 * news page. Any field whose element is missing is reported as 'Not Found'.
 *
 * @param {Document} doc - Parsed HTML document of the news page.
 * @param {number} newsId - ID of the news item; copied into the result.
 * @param {string} url - URL the page came from; copied into the result.
 * @returns {{news_id: number, title: string, time: string, views: string, news_body: string, url: string}}
 */
function getNewsInfo(doc, newsId, url) {
    const NOT_FOUND = 'Not Found';

    // Trimmed text of the first `selector` match inside `parent`, or the placeholder.
    const childText = (parent, selector) => {
        const node = parent ? parent.querySelector(selector) : null;
        return node ? node.textContent.trim() : NOT_FOUND;
    };

    const titleBox = doc.querySelector('#news_title');
    const infoBox = doc.querySelector('#news_info');
    const bodyBox = doc.querySelector('#news_body');

    return {
        news_id: newsId,
        title: childText(titleBox, 'a'),
        time: childText(infoBox, 'span.time'),
        views: childText(infoBox, 'span.view#News_TotalView'),
        news_body: bodyBox ? bodyBox.innerText.trim() : NOT_FOUND,
        url: url
    };
}
/**
 * Group the scraped news by publication month and draw one word cloud per group.
 *
 * Each item is annotated in place with `date` ("YYYY-MM-DD" or null) and
 * `year_month` ("YYYY-MM" or 'Unknown'), taken from its `time` text
 * (e.g. "发布于 2023-09-30 12:34"). The text of a group is the titles of its
 * items, plus their bodies when `isConsiderBody` is true, joined by spaces.
 *
 * @param {Array<Object>} newsData - Records with `title`, `time` and `news_body`.
 */
function processData(newsData) {
    // Tag every item with its date and year-month. The month is read straight
    // from the matched text: going through `new Date('YYYY-MM-DD')` parses the
    // string as UTC and can shift the month when read back in local time.
    for (const item of newsData) {
        const dateMatch = item.time.match(/发布于\s+(\d{4}-\d{2}-\d{2})/);
        if (dateMatch) {
            item.date = dateMatch[1];
            item.year_month = item.date.slice(0, 7);
        } else {
            item.date = null;
            item.year_month = 'Unknown';
        }
    }

    // Group by year-month, keeping first-seen order of the months.
    const groupedData = new Map();
    for (const item of newsData) {
        if (!groupedData.has(item.year_month)) {
            groupedData.set(item.year_month, []);
        }
        groupedData.get(item.year_month).push(item);
    }

    // One word cloud per month, built from that month's own items.
    for (const [yearMonth, group] of groupedData) {
        const combinedText = group
            .map((entry) => (isConsiderBody ? entry.title + ' ' + entry.news_body : entry.title))
            .join(' ');
        generateWordCloud(combinedText, yearMonth);
    }
    $('#scraping-status').text('All word clouds generated.');
}
/**
 * Render one titled word cloud: a bordered box holding a heading and a
 * 500x500 canvas is inserted right after `#news-scraper`, then wordcloud2.js
 * draws the word list derived from `text` onto the canvas.
 *
 * @param {string} text - Text to segment and count via getWordList().
 * @param {string} title - Heading shown above the canvas.
 */
function generateWordCloud(text, title) {
    const boxStyles = {
        'border': '1px solid #ccc',
        'margin': '10px',
        'padding': '10px'
    };
    const box = $('<div></div>').css(boxStyles);

    // Heading first, canvas second, so the title sits above the drawing.
    box.append($('<h3></h3>').text(title));
    const surface = $('<canvas></canvas>').attr('width', 500).attr('height', 500);
    box.append(surface);
    $('#news-scraper').after(box);

    // The word list is computed only after the box is already in the page.
    const options = {
        list: getWordList(text),
        gridSize: 10,
        weightFactor: 5,
        fontFamily: 'Microsoft Yahei, SimHei, Arial, sans-serif',
        color: 'random-dark',
        backgroundColor: '#fff'
    };
    WordCloud(surface[0], options);
}
/**
 * Segment `text` with segmentit and count how often each word occurs.
 *
 * Words of a single character are ignored. Counts are kept in a Map rather
 * than a plain object so that words matching inherited object members
 * (e.g. "constructor", "toString") are counted like any other word.
 *
 * @param {string} text - Text to segment.
 * @returns {Array<[string, number]>} [word, frequency] pairs, most frequent
 *     first; words with equal counts keep first-seen order.
 */
function getWordList(text) {
    // Use segmentit to segment Chinese text; each segment exposes its word as `w`.
    const segments = segmentit.doSegment(text);
    const freqMap = new Map();
    for (const seg of segments) {
        const word = seg.w;
        if (word.length > 1) { // Ignore single characters
            freqMap.set(word, (freqMap.get(word) || 0) + 1);
        }
    }
    // Convert to [word, frequency] pairs sorted by descending frequency.
    return Array.from(freqMap.entries()).sort(function (a, b) {
        return b[1] - a[1];
    });
}
}
})();