博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
淘宝爬虫
阅读量:6174 次
发布时间:2019-06-21

本文共 3200 字,大约阅读时间需要 10 分钟。

 

# Taobao search-result scraper: drives Chrome through Selenium, parses each
# result page with pyquery and stores one MongoDB document per product.
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from config import *  # supplies MONGO_URL, MONGO_DB and MONGO_TABLE

# How many times a timed-out page interaction is retried before giving up.
# The retry used to be unbounded recursion, which could only end in a
# RecursionError when the page never became ready.
_MAX_RETRIES = 5

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.Chrome()

# Shared explicit wait: each wait.until() below raises TimeoutException
# after 10 seconds.
wait = WebDriverWait(browser, 10)


def search(max_retries=_MAX_RETRIES):
    """Open Taobao, search for 'xiaomi' and scrape the first result page.

    Args:
        max_retries: Number of additional attempts made after a timeout.

    Returns:
        The text of the pager's ``div.total`` element. ``main`` reads the
        first integer in it as the number of result pages.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    for attempt in range(max_retries + 1):
        try:
            browser.get('https://www.taobao.com')
            input_ = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )

            input_.send_keys('xiaomi')
            submit.click()

            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Start over from the home page unless the retry budget is spent.
            if attempt == max_retries:
                raise


def next_page(page_num, max_retries=_MAX_RETRIES):
    """Jump to result page ``page_num`` through the pager form and scrape it.

    Args:
        page_num: Page number typed into the pager's input box.
        max_retries: Number of additional attempts made after a timeout.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    for attempt in range(max_retries + 1):
        try:
            input_ = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            input_.clear()
            input_.send_keys(page_num)
            submit.click()
            # Wait until the pager highlights the requested page number, so
            # the scrape does not run against the previous page's listing.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return
        except TimeoutException:
            if attempt == max_retries:
                raise


def get_products():
    """Parse the current result page and save every listed product.

    Waits for at least one item to be present, then extracts image, price,
    deal count, title, shop and location from each item, prints the record
    and passes it to ``save_to_mongo``.

    Raises:
        TimeoutException: If no item appears within the wait timeout.
    """
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item'))
    )
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text; presumably a
            # fixed three-character suffix after the number (TODO confirm
            # against the live page).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert ``result`` into the ``MONGO_TABLE`` collection, best effort.

    A failed insert is reported on stdout and does not abort the crawl.

    Args:
        result: Product dict to store.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # single-document replacement.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Deliberately broad: one bad record must not stop the whole crawl,
        # but the cause is now printed instead of being discarded.
        print('error to mongo', exc)
    else:
        print('success save to mongodb', result)


def main():
    """Run the crawl: scrape page one, then every remaining result page.

    Raises:
        ValueError: If no page count can be found in the pager text.
        TimeoutException: If a page keeps timing out after all retries.
    """
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'could not find a page count in pager text: %r' % (total_text,)
            )
        total = int(match.group(1))
        # Page 1 was scraped by search(); the upper bound is total + 1 so the
        # last page is included (range(2, total) skipped it).
        for page_num in range(2, total + 1):
            next_page(page_num)
    finally:
        # Always end the WebDriver session, even when the crawl fails.
        browser.quit()


if __name__ == '__main__':
    main()

 

 config.py

# MongoDB settings read by the scraper through `from config import *`.
MONGO_URL = 'localhost'    # host passed to pymongo.MongoClient
MONGO_DB = 'taobao'        # database name
MONGO_TABLE = 'product'    # collection that receives the product documents

 

 

 

运行结果:

 数据库:

 

 

 

转载于:https://www.cnblogs.com/MC-Curry/p/9338906.html

你可能感兴趣的文章
Nginx + CGI/FastCGI + C/Cpp
查看>>
学习笔记------jsp页面与jsp标记
查看>>
DS博客作业02--线性表
查看>>
第三届ACM山东省赛I题_Chess_STL
查看>>
jQuery each和js forEach用法比较
查看>>
前端笔记-作用域链的一些理解加记录(JS高级程序设计读书笔记1)
查看>>
改造你的网站,变身 PWA
查看>>
Leetcode 142. Linked List Cycle II JAVA语言
查看>>
网络基础5
查看>>
Exchange Supported operating system platforms
查看>>
unity3鼠标点击移动
查看>>
Linux 安装中文包
查看>>
谷物大脑
查看>>
访问控制-禁止php解析、user_agent,PHP相关配置
查看>>
AgileEAS.NET之系统架构
查看>>
python3.5里的正则表达式
查看>>
Exchange server 2013 SP1 客户端会议室邮箱自动回复延迟
查看>>
nginx反向代理缓存服务器构建
查看>>
RHEL6 搭建LVS/DR 负载均衡集群 案例
查看>>
以太坊·Rinkeby 测试网络
查看>>