看世界
 
昨日:篇  今日:篇   总帖:篇   会员:
admin
创始人Lv2   
楼盘,名称,pn,url     
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
import urllib2

# Index page the crawl starts from.
url = "https://www.anjuke.com/chengdu/cm/"

# Browser-like request headers sent with every request made by this script.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}

# Download the index page. The response is closed in ``finally`` so the
# connection is released even when ``read()`` raises.
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request, timeout=15)
try:
    data = response.read()
finally:
    response.close()
soup = BeautifulSoup(data, "html.parser")

# Elements carrying the "P2a" class. Bound to ``p2a_nodes`` rather than
# ``list`` so the builtin ``list`` type is no longer shadowed; nothing else in
# the visible script reads this result.
p2a_nodes = soup.find_all(class_="P2a")
# pn = soup.find_all(class_="P4")

# print pn






# Collect the anchors whose href matches the index URL followed by a path
# segment of 4-15 ASCII letters and a trailing slash. The prefix is taken from
# ``url`` and passed through ``re.escape`` so that "." only matches a literal
# dot and the address is not duplicated as a second hard-coded string.
listurls = soup.find_all('a', href=re.compile(re.escape(url) + r'[a-zA-Z]{4,15}/'))





# Patterns are compiled once, outside the loop.
# Links containing "zhoubian" or "p<digits>" in their href are skipped.
_SKIP_HREF = re.compile(r'(.*zhoubian|p(\d+))')
# "Previous page" / "next page" labels that are stripped from the pager text.
_PAGER_LABELS = re.compile('上一页|下一页')
_WHITESPACE = re.compile(r'\s')
# File that receives one appended record per processed link.
_OUTPUT_PATH = "D:\\down\\loupan\\chengdu\\city-1012.txt"

# For every collected link: fetch the linked page, extract the text of its
# "P4" element (stored as ``pn``), print it, and append a
# ``['<name>','<pn>','<href>'],`` record to the output file.
for link in listurls:
    href = link['href']
    if _SKIP_HREF.search(href):
        continue
    name = link.get_text()

    # Fetch the linked page; always close the response so the connection is
    # not leaked across the many requests this loop makes.
    request = urllib2.Request(href, headers=header)
    response = urllib2.urlopen(request, timeout=15)
    try:
        page_html = response.read()
    finally:
        response.close()
    page_soup = BeautifulSoup(page_html, "html.parser")

    # ``find`` returns None when the page has no element with class "P4";
    # record an empty value in that case instead of aborting the whole crawl
    # with an AttributeError.
    pager = page_soup.find(class_="P4")
    if pager is None:
        pn = ''
    else:
        pn = pager.get_text().encode('utf-8')
        pn = _PAGER_LABELS.sub('', pn)
        pn = _WHITESPACE.sub('', pn)

    # Single-argument form: prints the same text as the original
    # ``print pn`` statement under Python 2.
    print(pn)

    # The file is reopened in append mode for each record, as before, so
    # every record written so far is on disk if a later request fails.
    with open(_OUTPUT_PATH, "a") as f:
        f.write("['" + name.encode("utf8") + "','" + pn + "','" + href.encode("utf8") + "'],")
        #


页码:

 12345678910

 12345678910
 12345678910
 0  已被阅读了38次  楼主 2018-10-12 14:09:46
回复列表

回复:楼盘,名称,pn,url

LOOK官方站 联系站长
Powered by Look博客