網頁爬蟲是滿常遇到的課題,今天練習一下用這三種語言寫爬蟲
PHP
sunra/php-simple-html-dom-parser
crawler.php
<?php

// Fetch the PTT NBA board index and write every post title to php.txt.
include 'vendor/autoload.php';

use Sunra\PhpSimple\HtmlDomParser;

$url = 'https://www.ptt.cc/bbs/NBA/index.html';
$html = HtmlDomParser::file_get_html($url);

// file_get_html() returns false on a network/parse failure — bail out
// early instead of calling find() on a non-object below.
if ($html === false) {
    fwrite(STDERR, "Failed to fetch $url\n");
    exit(1);
}

$titles = $html->find('div.title');
$file = fopen('php.txt', 'w');

foreach ($titles as $title) {
    // plaintext strips the tags; trim() drops PTT's surrounding whitespace.
    $subject = trim($title->plaintext)."\n";
    fwrite($file, $subject);
}

fclose($file);
// Release the DOM tree — simple_html_dom holds circular refs without this.
$html->clear();
Python
crawler.py
# Fetch the PTT NBA board index and write every post title to py.txt.
#
# The original reload(sys) / sys.setdefaultencoding('utf-8') hack is removed:
# it mutates interpreter-global codec state and can silently break other
# libraries. io.open with an explicit encoding handles UTF-8 output cleanly
# on both Python 2 and Python 3.
import io

import requests
from bs4 import BeautifulSoup

url = 'https://www.ptt.cc/bbs/NBA/index.html'
res = requests.get(url)
res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page

soup = BeautifulSoup(res.text, 'html.parser')
titles = soup.find_all('div', 'title')

# "with" guarantees the file is closed even if a write fails.
with io.open('py.txt', 'w', encoding='utf-8') as f:
    for title in titles:
        f.write('%s\n' % title.text.strip())
Node.js
crawler.js
| const cheerio = require('cheerio'); |
| const request = require('request'); |
| const fs = require('fs'); |
| |
| const url = 'https://www.ptt.cc/bbs/NBA/index.html'; |
| const out = fs.createWriteStream('nodejs.txt'); |
| |
| request(url, (err, res, body) => { |
| if (err) { |
| console.log(err); |
| } |
| |
| const $ = cheerio.load(body); |
| const divs = $('div.title'); |
| |
| divs.each((i, item) => { |
| out.write($(item).text().trim() + '\n'); |
| }); |
| |
| out.end(); |
| }); |
這三個程式執行的結果會得到一樣的內容,當然這是很簡略的範例,照理說要寫一些容錯判斷,但練習用的範例先這樣吧,再附上執行結果比較一下速度
PHP 7.0.22
| $ time php crawler.php |
| |
| real 0m0.614s |
| user 0m0.072s |
| sys 0m0.032s |
| |
| real 0m0.570s |
| user 0m0.056s |
| sys 0m0.032s |
| |
| real 0m0.570s |
| user 0m0.056s |
| sys 0m0.032s |
| |
| real 0m0.515s |
| user 0m0.040s |
| sys 0m0.032s |
| |
| real 0m0.515s |
| user 0m0.040s |
| sys 0m0.032s |
| |
| real 0m0.515s |
| user 0m0.040s |
| sys 0m0.032s |
Node.js v9.0.0
| $ time node crawler.js |
| |
| real 0m1.357s |
| user 0m0.244s |
| sys 0m0.396s |
| |
| real 0m1.744s |
| user 0m0.376s |
| sys 0m0.312s |
| |
| real 0m1.312s |
| user 0m0.272s |
| sys 0m0.380s |
| |
| real 0m1.522s |
| user 0m0.348s |
| sys 0m0.304s |
| |
| real 0m1.369s |
| user 0m0.288s |
| sys 0m0.396s |
| |
| real 0m1.523s |
| user 0m0.388s |
| sys 0m0.344s |
Python 2.7.12
| $ time python crawler.py |
| real 0m0.666s |
| user 0m0.232s |
| sys 0m0.064s |
| |
| real 0m0.682s |
| user 0m0.204s |
| sys 0m0.088s |
| |
| real 0m0.709s |
| user 0m0.240s |
| sys 0m0.064s |
| |
| real 0m0.624s |
| user 0m0.228s |
| sys 0m0.032s |
| |
| real 0m0.669s |
| user 0m0.264s |
| sys 0m0.052s |
| |
| real 0m0.698s |
| user 0m0.260s |
| sys 0m0.068s |
基本上 PHP 跟 Python 都是按下去一下就做完了,Node.js 有等待的感覺,不知道是哪個環節出問題,我有把檔案寫入跟 DOM 搜尋的部份關閉,速度沒差多少,所以是 [request][] 套件慢?但網路上大家都推 [request][],它應該也是基於 Node.js 內建 http module 的高階封裝,而且其他人也都有用套件,只能說幾乎每次我測試東西 Node.js 的數據都會吊車尾,目前還是選擇以使用它好用的工具群為主。