terrorboy
Recent Comments
Recent Posts
04-23 23:03
«   2024/04   »
1 2 3 4 5 6
7 8 9 10 11 12 13
14 15 16 17 18 19 20
21 22 23 24 25 26 27
28 29 30
Tags
more
Archives
Today
Total
관리 메뉴

z9n

[NodeJS] 크롤링을 활용한 스크랩(DB저장) 본문

NodeJS, NodeWebKit

[NodeJS] 크롤링을 활용한 스크랩(DB저장)

terrorboy 2017. 5. 19. 11:03

[NodeJS] 크롤링을 활용한 스크랩(DB저장)

전문 분야가 아니라 많이 부족합니다.
참고만 부탁드리며 악용하지 말아 주세요.

app.js

var Spider = require('node-spider');
var mysql = require('mysql');
var Entities = require('html-entities').AllHtmlEntities;
// HTML entity decoder applied to the scraped markup before it is stored.
// Declared with `var` so that no implicit global is created (a bare
// assignment throws a ReferenceError in strict mode / ES modules).
var entities = new Entities();

// Single MySQL connection shared by every crawled page.
// Replace the placeholder credentials below before running.
var conn = mysql.createConnection({
    host    :'localhost',
    port : 3306,
    user : 'DB 아이디',
    password : 'DB 비밀번호',
    database:'DB 명'
});

// Open the connection up front, before the crawl starts.
conn.connect();



// Fallback for runtimes without a native String.prototype.trim: removes
// whitespace, BOM (U+FEFF) and NBSP (U+00A0) from both ends of the string.
if (!String.prototype.trim) {
  String.prototype.trim = function () {
    var edgeWhitespace = /^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g;
    return this.replace(edgeWhitespace, '');
  };
}
/**
 * Backslash-escapes a string.
 *
 * Backslash, single quote and double quote get a leading backslash;
 * backspace, tab, newline, form feed and carriage return are replaced by
 * their two-character escape sequences (\b, \t, \n, \f, \r).
 *
 * @param {string} string - Text to escape.
 * @returns {string} The escaped text.
 */
function addslashes(string) {
    // One lookup table + one pass gives the same result as replacing each
    // character in turn, because no replacement output contains a character
    // that another rule would match.
    var escapes = {
        '\\': '\\\\',
        '\u0008': '\\b',
        '\t': '\\t',
        '\n': '\\n',
        '\f': '\\f',
        '\r': '\\r',
        '\'': '\\\'',
        '"': '\\"'
    };

    return string.replace(/[\\\u0008\t\n\f\r'"]/g, function (ch) {
        return escapes[ch];
    });
}

/**
 * Escapes the characters that are special inside a MySQL quoted string
 * literal: NUL, backspace, tab, Ctrl-Z, newline, carriage return, both quote
 * characters and the backslash.
 *
 * `%` is intentionally NOT escaped: outside of a LIKE pattern MySQL keeps
 * `\%` as the two characters backslash + percent, so escaping it would
 * corrupt stored text.
 *
 * @param {*} str - Value to escape.
 * @returns {*} The escaped string, or `str` unchanged when it is not a string.
 */
function mysql_real_escape_string (str) {
    if (typeof str !== 'string')
        return str;

    var replacements = {
        "\0": "\\0",
        "\x08": "\\b",
        "\x09": "\\t",
        // MySQL escape sequences are case-sensitive: Ctrl-Z is `\Z`
        // (a lowercase `\z` would be read back as a plain "z").
        "\x1a": "\\Z",
        "\n": "\\n",
        "\r": "\\r",
        "\"": "\\\"",
        "'": "\\'",
        "\\": "\\\\"
    };

    return str.replace(/[\0\x08\x09\x1a\n\r"'\\]/g, function (char) {
        return replacements[char];
    });
}


// Crawler instance shared by every handler in this script.
var spider = new Spider({
    // How many requests can be run in parallel
    concurrent: 5,
    // How long to wait after each request
    delay: 0,
    // A stream to where internal logs are sent, optional
    logs: process.stderr,
    // Re-visit visited URLs, false by default
    allowDuplicates: false,
    // If `true` all queued handlers will be try-catch'd, errors go to `error` callback
    catchErrors: true,
    // Called when there's an error, throw will be used if none is provided
    error: function(err, url) { console.log(err); },
    // Called when there are no more requests.
    // Close the MySQL connection here; otherwise its open socket keeps the
    // process alive after the crawl has finished. `end()` is used (rather
    // than an immediate teardown) so already-queued inserts can complete.
    done: function() {
        conn.end(function(err) {
            if (err) console.log(err);
        });
    },
    //- All options are passed to `request` module, for example:
    headers: { 'user-agent': 'TerrorBoy' },
    // NOTE(review): `null` is handed through to `request`; presumably this
    // yields the raw response body instead of a utf8-decoded string — confirm.
    encoding: null
});

/**
 * Handles one crawled page: stores the post in the `_sc` table when the page
 * has body text, then queues every link whose resolved URL contains
 * `/cm_free/` for crawling with this same handler.
 *
 * Values are sent to MySQL through `?` placeholders, so each one is escaped
 * exactly once by the driver. (Escaping by hand and then again with
 * `conn.escape` stored extra backslashes, the URL was not escaped at all,
 * and an empty title/member produced invalid SQL.)
 *
 * @param {Object} doc - Crawled document. This handler uses `doc.$`
 *     (cheerio-style selector function), `doc.url` and `doc.resolve(href)`.
 */
var handleRequest = function(doc) {

    var body = doc.$('div.con_inner');
    var content = body.text().trim();
    var contentHTML = body.html();
    var title = doc.$('#head_title').text().trim();
    var member = doc.$('li#info_name').find('span.member').text().trim();
    var PageUrl = doc.url;

    if (contentHTML) {
        // Only HTML entities are decoded. The markup is not a URI component:
        // running decodeURIComponent over it throws URIError on any literal
        // "%" (e.g. "width:100%") and would rewrite %XX sequences in links.
        contentHTML = entities.decode(contentHTML).trim();
    }

    // Pages without body text (listings, empty posts) are not stored.
    if (content) {

        // new page crawled
        console.log('---시작---');
        console.log(PageUrl); // page url
        conn.query(
            "insert into `_sc` set `member` = ?, `title` = ?, `content` = ?, `html_content` = ?, `url` = ?, `rdate` = now()",
            [member, title, content, contentHTML, PageUrl],
            function(err) {
                // Report a failed insert instead of leaving it unhandled.
                if (err) console.log(err);
            }
        );
        console.log('---종료---');
    }

    // uses cheerio, check its docs for more info
    doc.$('a').each(function(i, elem) {

        var href = doc.$(elem).attr('href');

        // Anchors without an href have nothing to follow.
        if (href === undefined) return;

        // Drop the fragment so the same page is not queued once per anchor.
        var url = doc.resolve(href.split('#')[0]);

        if (url.indexOf('/cm_free/') > -1) {

            // crawl more
            spider.queue(url, handleRequest);
        }
    });
};

// Start crawling: seed the spider with the start URL. handleRequest then
// queues every link containing `/cm_free/` that it finds on each fetched page.
spider.queue('http://sir.kr/cm_free', handleRequest);


Comments