A Simple Web Scraper
Python libraries:
1. requests: fetches page content
2. BeautifulSoup: parses the fetched HTML
Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
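The two libraries divide the work: requests downloads the raw HTML, and BeautifulSoup turns it into a searchable tree. A minimal sketch of that division (using example.com as a stand-in page and the built-in html.parser, so nothing here is specific to Lianjia):

import requests
from bs4 import BeautifulSoup

response = requests.get('http://example.com')       # requests fetches the raw HTML
soup = BeautifulSoup(response.text, 'html.parser')  # BeautifulSoup parses it into a tree
print(soup.title.text)                              # prints "Example Domain"
print(soup.find('a').get('href'))                   # href of the first link on the page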
Scraping rental listings from Lianjia (lianjia.com)
Install the third-party libraries:
pip install requests
pip install bs4
pip install html5lib
pip install pymysql
Create the database:
CREATE DATABASE /*!32312 IF NOT EXISTS*/ `house` /*!40100 DEFAULT CHARACTER SET utf8 */;

USE `house`;

/* Table structure for table `db_house` */

DROP TABLE IF EXISTS `db_house`;

CREATE TABLE `db_house` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `price` varchar(80) DEFAULT NULL,
  `unit` varchar(80) DEFAULT NULL,
  `area` varchar(80) DEFAULT NULL,
  `layout` varchar(80) DEFAULT NULL,
  `floor` varchar(80) DEFAULT NULL,
  `direction` varchar(80) DEFAULT NULL,
  `subway` varchar(80) DEFAULT NULL,
  `community` varchar(80) DEFAULT NULL,
  `location` varchar(80) DEFAULT NULL,
  `agent_name` varchar(80) DEFAULT NULL,
  `agent_id` varchar(80) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `id` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=40 DEFAULT CHARSET=utf8;
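Before writing the scraper it is worth confirming the table was created correctly. A quick pymysql sketch (assuming the same localhost root/toor credentials configured in the scraper below):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='toor',
                       database='house', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('DESCRIBE db_house')
    for column in cursor.fetchall():
        print(column)  # each row: (Field, Type, Null, Key, Default, Extra)
conn.close()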
The scraper program is as follows:
import time

import pymysql
import requests
from bs4 import BeautifulSoup
# Fetch the page content at the given url and return a BeautifulSoup object
def get_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    return soup

# Gather the links to every rental detail page on a listing page; returns a list of links
def get_links(link_url):
    soup = get_page(link_url)
    links_div = soup.find_all('div', class_="pic-panel")
    links = [div.a.get('href') for div in links_div]
    return links

# Parse one rental detail page and return the listing fields as a dict
def get_house_info(house_url):
    soup = get_page(house_url)
    price = soup.find('span', class_='total').text
    unit = soup.find('span', class_='unit').text.strip()
    # The detail fields sit in consecutive <p> tags; the slices strip the Chinese labels
    house_info = soup.find_all('p')
    area = house_info[0].text[3:]
    layout = house_info[1].text[5:]
    floor = house_info[2].text[3:]
    direction = house_info[3].text[5:]
    subway = house_info[4].text[3:]
    community = house_info[5].text[3:]
    location = house_info[6].text[3:]
    create_time = house_info[7].text[3:]
    agent = soup.find('a', class_='name LOGCLICK')
    agent_name = agent.text
    agent_id = agent.get('data-el')
    # Rating info is parsed here but not stored in the database
    evaluate = soup.find('div', class_='evaluate')
    score, number = evaluate.find('span', class_='rate').text.split('/')
    times = evaluate.find('span', class_='time').text[5:-1]
    info = {
        '价格': price,
        '单位': unit,
        '面积': area,
        '户型': layout,
        '楼层': floor,
        '朝向': direction,
        '发布时间': create_time,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '经纪人姓名': agent_name,
        '经纪人ID': agent_id
    }
    return info

DATABASE = {
    'host': 'localhost',  # for a remote database, use the remote server's IP address
    'database': 'house',
    'user': 'root',
    'password': 'toor',
    'charset': 'utf8'  # character set, prevents garbled data
}

def get_db(setting):
    return pymysql.connect(**setting)

def insert(db, house):
    # Build 11 quoted placeholders, one per column
    values = "'{}'," * 10 + "'{}'"
    sql_values = values.format(house['价格'], house['单位'], house['面积'], house['户型'],
                               house['楼层'], house['朝向'], house['地铁'], house['小区'],
                               house['位置'], house['经纪人姓名'], house['经纪人ID'])
    sql = """
    insert into db_house(`price`,`unit`,`area`,`layout`,`floor`,`direction`,`subway`,`community`,`location`,`agent_name`,`agent_id`)
    values({})
    """.format(sql_values)
    print(sql)
    cursor = db.cursor()
    cursor.execute(sql)
    db.commit()
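Note that insert() splices the values into the SQL string directly, which breaks as soon as a field contains a quote character. A safer variant (a sketch, not the code used in this post; insert_safe is a hypothetical name) lets pymysql bind the parameters itself:

def insert_safe(db, house):
    # %s placeholders are bound by the driver, so quotes in values cannot break the SQL
    sql = ("insert into db_house(`price`,`unit`,`area`,`layout`,`floor`,`direction`,"
           "`subway`,`community`,`location`,`agent_name`,`agent_id`) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    params = (house['价格'], house['单位'], house['面积'], house['户型'],
              house['楼层'], house['朝向'], house['地铁'], house['小区'],
              house['位置'], house['经纪人姓名'], house['经纪人ID'])
    cursor = db.cursor()
    cursor.execute(sql, params)
    db.commit()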
db = get_db(DATABASE)
links = get_links('http://bj.lianjia.com/zufang/')
for link in links:
    time.sleep(2)  # throttle requests so the crawl stays polite
    house = get_house_info(link)
    print('Fetched one listing successfully!')
    print(house, end='\r')
    insert(db, house)
Open the database and you can see that the rental listings have been stored in MySQL.
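To check the result without leaving Python, a short query sketch reusing the helpers defined above:

db = get_db(DATABASE)
cursor = db.cursor()
cursor.execute('SELECT id, price, community, agent_name FROM db_house LIMIT 5')
for row in cursor.fetchall():
    print(row)  # one tuple per stored listing
db.close()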