Python编程入门学习笔记(七)

Python 学习笔记(七)

简单爬虫

python库
    1、requests 用来获取页面内容
    2、BeautifulSoup 

    文档链接:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

爬取链家网的信息

    

安装第三方库

pip install requests
pip install bs4

新建数据库:


CREATE DATABASE /*!32312 IF NOT EXISTS*/`house` /*!40100 DEFAULT CHARACTER SET utf8 */;

USE `house`;

/*Table structure for table `db_house` */

DROP TABLE IF EXISTS `db_house`;

/* All scraped fields are stored as free text (varchar), since the crawler
   extracts display strings rather than typed values.
   Note: the original schema also declared KEY `id` (`id`), which exactly
   duplicated the PRIMARY KEY and only wasted space — it has been removed. */
CREATE TABLE `db_house` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `price` varchar(80) DEFAULT NULL,
  `unit` varchar(80) DEFAULT NULL,
  `area` varchar(80) DEFAULT NULL,
  `layout` varchar(80) DEFAULT NULL,
  `floor` varchar(80) DEFAULT NULL,
  `direction` varchar(80) DEFAULT NULL,
  `subway` varchar(80) DEFAULT NULL,
  `community` varchar(80) DEFAULT NULL,
  `location` varchar(80) DEFAULT NULL,
  `agent_name` varchar(80) DEFAULT NULL,
  `agent_id` varchar(80) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=40 DEFAULT CHARSET=utf8;

爬虫程序如下:

import time
import pymysql
import requests
from bs4 import BeautifulSoup
# 获取url下的页面内容,返回soup对象
def get_page(url):
    """Fetch *url* over HTTP and return the parsed BeautifulSoup document.

    Fixes over the original: a request timeout (the original could hang
    forever on a dead host) and raise_for_status() (the original silently
    parsed 4xx/5xx error pages, producing confusing downstream failures).
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # fail fast on HTTP errors instead of scraping an error page
    soup = BeautifulSoup(response.text, 'html5lib')
    return soup

# 将以上的代码封装成一个获取链接的方法函数,作用是获取列表页下面的所有租房页面的链接,返回链接列表

def get_links(link_url):
    """Return the detail-page URL of every listing on one list page.

    Each listing sits in a <div class="pic-panel"> whose first <a> holds
    the link to the rental's own page.
    """
    soup = get_page(link_url)
    detail_urls = []
    for panel in soup.find_all('div', class_="pic-panel"):
        detail_urls.append(panel.a.get('href'))
    return detail_urls

def get_house_info(house_url):
    """Scrape one rental detail page into a dict of Chinese-keyed strings.

    NOTE(review): the positional indexing into find_all('p') and the
    hard-coded slice offsets (which strip the Chinese label prefixes such
    as '面积:') assume Lianjia's page layout at the time of writing —
    verify against the live markup before relying on this.

    Fix over the original: it also parsed a 'evaluate' <div> into
    score/number/times, but never used any of those values, and the lookup
    raised AttributeError whenever the div was absent. That dead code has
    been removed.
    """
    soup = get_page(house_url)
    price = soup.find('span', class_='total').text
    unit = soup.find('span', class_='unit').text.strip()
    house_info = soup.find_all('p')
    # Slice offsets drop the label prefix from each <p>'s text.
    area = house_info[0].text[3:]
    layout = house_info[1].text[5:]
    floor = house_info[2].text[3:]
    direction = house_info[3].text[5:]
    subway = house_info[4].text[3:]
    community = house_info[5].text[3:]
    location = house_info[6].text[3:]
    create_time = house_info[7].text[3:]
    agent = soup.find('a', class_='name LOGCLICK')
    agent_name = agent.text
    agent_id = agent.get('data-el')
    info = {
        '价格': price,
        '单位': unit,
        '面积': area,
        '户型': layout,
        '楼层': floor,
        '朝向': direction,
        '发布时间': create_time,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '经纪人姓名': agent_name,
        '经纪人ID': agent_id
    }
    return info

# Connection settings passed verbatim to pymysql.connect (see get_db).
DATABASE = {
    'host': 'localhost', # for a remote database, use the remote server's IP address
    'database': 'house',
    'user' : 'root',
    'password': 'toor',
    # character-set encoding, prevents garbled data
    'charset' : 'utf8'
}
    
def get_db(setting):
    """Open and return a pymysql connection, using *setting* as keyword args."""
    connection = pymysql.connect(**setting)
    return connection

def insert(db, house):
    """Insert one scraped house record (dict from get_house_info) into db_house.

    Fix over the original: the original spliced scraped text straight into
    the SQL string with str.format, so any quote character in the data broke
    the statement — and allowed SQL injection. This version uses a
    parameterized query (the driver escapes the values) and closes the
    cursor when done.
    """
    sql = (
        "insert into db_house(`price`,`unit`,`area`,`layout`,`floor`,"
        "`direction`,`subway`,`community`,`location`,`agent_name`,`agent_id`) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (house['价格'], house['单位'], house['面积'], house['户型'],
              house['楼层'], house['朝向'], house['地铁'], house['小区'],
              house['位置'], house['经纪人姓名'], house['经纪人ID'])
    cursor = db.cursor()
    try:
        cursor.execute(sql, params)
        db.commit()
    finally:
        cursor.close()

# Top-level crawl: fetch the listing page, then each detail page in turn.
# Fixes over the original: the success message used to print *before* the
# page was fetched (so it lied on failure), and the DB connection was never
# closed.
db = get_db(DATABASE)
try:
    links = get_links('http://bj.lianjia.com/zufang/')
    for link in links:
        time.sleep(2)  # throttle so we don't hammer the site
        house = get_house_info(link)
        print('获取一个房子信息成功!')
        print(house, end='\r')
        insert(db, house)
finally:
    db.close()

打开数据库,可以看到租房信息已经存储到mysql数据库。

Author: allengao

发表评论

电子邮件地址不会被公开。 必填项已用*标注