看世界
 
昨日:篇  今日:篇   总帖:篇   会员:
admin
创始人Lv2   
无 pn     
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
import urllib2

url = "https://www.anjuke.com/chengdu/cm/"
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}

request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request, timeout=15)
data = response.read()
soup = BeautifulSoup(data, "html.parser")
# print soup.title.string
# soup.find_all(class_='P3')
list = soup.find_all(class_="P2a")

listurls = soup.find_all('a', href=re.compile('https://www.anjuke.com/chengdu/cm/[a-zA-Z]{4,15}/'))

for url in listurls:
    if not re.search('(.*zhoubian|p(\d+))', url['href']):

        print url.get_text(),url['href']

        urls = url['href']
        name = url.get_text()
        # print name.encode("utf8")

        with open("D:\\down\\loupan\\chengdu\\name.txt", "a") as f:

            f.write("['" + name.encode("utf8") + "','" + "pn" + "','" + url['href'].encode("utf8") + "'],")



Output:

青羊 https://www.anjuke.com/chengdu/cm/qingyang/

锦江 https://www.anjuke.com/chengdu/cm/jinjiang/
金牛 https://www.anjuke.com/chengdu/cm/jinniu/
武侯 https://www.anjuke.com/chengdu/cm/wuhou/

 0  已被阅读了31次  楼主 2018-10-12 11:38:11
回复列表

回复:无 pn

LOOK官方站 联系站长
Powered by Look博客