[NodeJS] 크롤링을 활용한 스크랩(DB저장)
전문 분야가 아니라 많이 부족합니다.
참조만 부탁 드리며 악용하지 말아주세요.
app.js
var Spider = require('node-spider');
var mysql = require('mysql');
var Entities = require('html-entities').AllHtmlEntities;
// Shared HTML-entity decoder. Declared with `var` so it is a proper
// module-level binding instead of an implicit global (an undeclared
// assignment throws a ReferenceError under strict mode / ES modules).
var entities = new Entities();

// MySQL connection used by the crawler's insert statement.
// The user, password and database strings below look like placeholders
// and presumably must be replaced with real credentials before running.
var conn = mysql.createConnection({
  host: 'localhost',
  port: 3306,
  user: 'DB 아이디',
  password: 'DB 비밀번호',
  database: 'DB 명'
});

// Open the connection up front, before any page handler issues a query.
conn.connect();
// Fallback for runtimes without a native String.prototype.trim:
// removes leading/trailing whitespace, BOM (U+FEFF) and NBSP (U+00A0).
// When a native implementation exists it is left untouched.
if (!String.prototype.trim) {
  String.prototype.trim = function () {
    var edgeWhitespace = /^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g;
    return this.replace(edgeWhitespace, '');
  };
}
/**
 * Backslash-escapes a string: backslash, backspace, tab, newline, form feed,
 * carriage return, single quote and double quote are each replaced by their
 * backslash-prefixed escape sequence.
 *
 * @param {string} string - Text to escape; a value without a `replace`
 *   method (e.g. undefined) causes a TypeError.
 * @returns {string} The escaped text.
 */
function addslashes(string) {
  // Order matters: backslashes are doubled first so that the backslashes
  // inserted by the later rules are not escaped a second time.
  var rules = [
    [/\\/g, '\\\\'],
    [/\u0008/g, '\\b'],
    [/\t/g, '\\t'],
    [/\n/g, '\\n'],
    [/\f/g, '\\f'],
    [/\r/g, '\\r'],
    [/'/g, '\\\''],
    [/"/g, '\\"']
  ];
  var escaped = string;
  for (var i = 0; i < rules.length; i++) {
    escaped = escaped.replace(rules[i][0], rules[i][1]);
  }
  return escaped;
}
/**
 * Escapes characters in a string by prefixing them with a backslash.
 *
 * NUL, backspace, tab, Ctrl-Z (0x1a), newline and carriage return become the
 * two-character sequences \0, \b, \t, \z, \n and \r; double quote, single
 * quote, backslash and percent are kept but preceded by a backslash.
 *
 * @param {*} str - Value to escape.
 * @returns {*} The escaped string, or `str` itself when it is not a string.
 */
function mysql_real_escape_string (str) {
  if (typeof str !== 'string') {
    return str;
  }

  // Characters that are replaced by a letter-based escape sequence.
  var namedEscapes = {
    "\0": "\\0",
    "\x08": "\\b",
    "\x09": "\\t",
    "\x1a": "\\z",
    "\n": "\\n",
    "\r": "\\r"
  };

  return str.replace(/[\0\x08\x09\x1a\n\r"'\\\%]/g, function (matched) {
    if (Object.prototype.hasOwnProperty.call(namedEscapes, matched)) {
      return namedEscapes[matched];
    }
    // Remaining matches (quotes, backslash, percent) keep the character
    // and just gain a leading backslash.
    return "\\" + matched;
  });
}
// Crawler instance shared by the page handler and the start-up call.
var spider = new Spider({
  // Maximum number of requests running in parallel.
  concurrent: 5,

  // Wait time applied after each request (0 = no pause).
  delay: 0,

  // Optional stream that receives the spider's internal log output.
  logs: process.stderr,

  // Whether already-visited URLs may be fetched again (library default: false).
  allowDuplicates: false,

  // When true, queued handlers run inside try/catch and their errors are
  // routed to the `error` callback below.
  catchErrors: true,

  // Error callback; without one the spider would throw instead.
  // Only the error is logged, the failing URL is not.
  error: function onSpiderError(err, url) {
    console.log(err);
  },

  // Invoked once there are no more requests; intentionally a no-op.
  done: function onSpiderDone() {},

  // Every option is also handed to the `request` module, e.g. the headers
  // below. `encoding` is null here (a 'utf8' setting was tried and disabled).
  headers: { 'user-agent': 'TerrorBoy' },
  encoding: null
});
/**
 * Page handler for the spider: stores the crawled post in the `_sc` table
 * and queues every link on the page whose resolved URL contains '/cm_free/'.
 *
 * A row is inserted only when the page has non-empty text inside
 * `div.con_inner`. All values are bound through `?` placeholders, so the
 * MySQL driver escapes each one exactly once; empty strings and URLs
 * containing quotes are therefore safe.
 *
 * @param {Object} doc - Document passed in by the spider. This handler uses
 *   `doc.$` (cheerio selector function), `doc.url` and `doc.resolve(href)`.
 * @returns {undefined}
 */
var handleRequest = function(doc) {
  var content = doc.$('div.con_inner').text().trim();
  var title = doc.$('#head_title').text().trim();
  var contentHTML = doc.$('div.con_inner').html();
  var member = doc.$('li#info_name').find('span.member').text().trim();
  var pageUrl = doc.url;

  if (contentHTML) {
    contentHTML = entities.decode(contentHTML).trim();
    // Percent-decoding happens BEFORE the value is bound to the query.
    // Decoding an already-escaped SQL fragment could re-introduce quotes
    // (e.g. %27) after escaping.
    try {
      contentHTML = decodeURIComponent(contentHTML);
    } catch (decodeErr) {
      // A stray '%' (e.g. "100% sure") makes decodeURIComponent throw a
      // URIError; store the HTML undecoded rather than dropping the page.
    }
  }

  if (content) {
    // new page crawled
    console.log('---시작---');
    console.log(pageUrl); // page url
    conn.query(
      "insert into `_sc` set `member` = ?, `title` = ?, `content` = ?, `html_content` = ?, `url` = ?, `rdate` = now()",
      [member, title, content, contentHTML, pageUrl],
      function (err) {
        // Report failed inserts instead of leaving the error unhandled.
        if (err) {
          console.log(err);
        }
      }
    );
    console.log('---종료---');
  }

  // uses cheerio, check its docs for more info
  doc.$('a').each(function(i, elem) {
    var rawHref = doc.$(elem).attr('href');
    // Drop the fragment; anchors without an href fall back to ''.
    var href = rawHref !== undefined ? rawHref.split('#')[0] : '';
    var url = doc.resolve(href);
    if (url.indexOf('/cm_free/') > -1) {
      // crawl more
      spider.queue(url, handleRequest);
    }
  });
};
// Start crawling: seed the spider with the board's entry URL. handleRequest
// queues further links containing '/cm_free/' as pages are processed.
spider.queue('http://sir.kr/cm_free', handleRequest);