看世界
 
昨日:篇  今日:篇   总帖:篇   会员:
admin
创始人Lv2   
获取楼盘名称     
# -*- coding: UTF-8 -*-
import os
import math
import urllib2
import urllib
import re
import time
import random
import sys
sys.path.append("../")
import MySQLdb
from ftplib import FTP
import shutil
import threading
import random
from random import choice

path= "D:\\down\\loupan\\chongqing\\"#文件路径
LS = [['九龙坡','10','https://www.anjuke.com/chongqing/cm/jiulongpo/']]


for l in LS:
    name = l[0]
    pn = l[1]
    print pn

    print 'name:' + name
    url = l[2]
    txtpath = url.split('/')[-2]
    city = url.split('/')[-4]

    print txtpath


    urls = ['https://www.anjuke.com/'+city+'/cm/'+txtpath+'/p{}'.format(i) for i in range(1,int(pn))]

    print urls


    time.sleep(1)
    header = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language':'zh-CN,zh;q=0.8',
            'Cache-Control':'no-cache',
            'User-Agent':'Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit /537.36(KHTML, like  Gecko) Chrome/70.0.3510.2 Safari/537.36'
        }


    for url in urls:
        request = urllib2.Request(url,headers=header)
        response = urllib2.urlopen(request, timeout=15)
        data = response.read()
        response.close()
        searchObj = re.findall(r'/'+city+ '/cm\d.*?/"\n\s*target="_blank">(.*?)</a></em>', data, re.M|re.I)
        # searchObj = re.findall(r'/chengdu/cm\d.*?/"\n\s*target="_blank">(.*?)</a></em>', data, re.M|re.I)
        for i in searchObj:
            print name +'·' +i +';',
            time.sleep(1.1)
            with open("D:\\down\\fenlei\\"+city+"\\{}.txt".format(txtpath), "a") as f:
                f.write(name +'·'+ str(i) +"," )
                f.close()

print 'D:\\down\\fenlei\\"+city+"\\{}.txt'

"D:\Program Files (x86)\Anaconda2\python.exe" "D:/Program Files/JetBrains/PyCharm 2017.3.3/helpers/pydev/爬楼批量1012.py"


 0  已被阅读了25次  楼主 2018-10-20 14:25:06
回复列表

回复:获取楼盘名称

LOOK官方站 联系站长
Powered by Look博客