看世界
 
昨日:篇  今日:篇   总帖:篇   会员:
admin
创始人Lv2   
三步采集     
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
import urllib2

url = "https://www.anjuke.com/shanghai/cm/"
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'no-cache',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}

request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request, timeout=15)
data = response.read()
soup = BeautifulSoup(data, "html.parser")
# print soup.title.string
# soup.find_all(class_='P3')
list = soup.find_all(class_="P2a")
# pn = soup.find_all(class_="P4")

# print pn 2018年10月18日 10:00:00






listurls = soup.find_all('a', href=re.compile('https://www.anjuke.com/shanghai/cm/[a-zA-Z]{4,15}/'))





for url in listurls:
if not re.search('(.*zhoubian|p(\d+))', url['href']):

# print url.get_text(),url['href']
urls = url['href']
name = url.get_text()

# print url.get_text()







# m = re.split('\d+', 'url.get_text()')
# #
# print url.get_text()
# if re.search('((\d+))', url['href']):
# print int(url.get_text()[-1])

request = urllib2.Request(urls, headers=header)
response = urllib2.urlopen(request, timeout=15)
pn = response.read()
soup = BeautifulSoup(pn, "html.parser")

pn = soup.find(class_="P4").get_text()

# if re.search('((\d+))', pn):
pn = pn.encode('utf-8')

# print pn

# pn = re.sub('\s', '', pn)
pn = re.sub('上一页|下一页', '', pn)
pn = re.sub('\s', '', pn)


print pn.split()

# print name.encode("utf8")

with open("D:\\down\\loupan\\shanghai\\shanghai-1012.txt", "a") as f:

f.write("['" + name.encode("utf8") + "','" + pn + "','" + url['href'].encode("utf8") + "'],")






#



#二 获取 具体小区名称 --

# -*- coding: UTF-8 -*-
import os
import math
import urllib2
import urllib
import re
import time
import random
import sys
sys.path.append("../")
import MySQLdb
from ftplib import FTP
import shutil
import threading
import random
from random import choice

path= "D:\\down\\loupan\\chengdu\\"

# LS = [['大邑','3','www.anjuke.com/chengdu/cm/dayixian/'],['蒲江','2','www.anjuke.com/chengdu/cm/cdpujiangxian/']]

# LS = [['温江','7','https://www.anjuke.com/chengdu/cm/wenjiang/'],['龙泉驿','7','https://www.anjuke.com/chengdu/cm/longquanyi/'],['双流','7','https://www.anjuke.com/chengdu/cm/shuangliu/'],['都江堰','7','https://www.anjuke.com/chengdu/cm/dujiangyan/'],['郫都','9','https://www.anjuke.com/chengdu/cm/piduqu/'],['新都','8','https://www.anjuke.com/chengdu/cm/xindu/'],['青白江','3','https://www.anjuke.com/chengdu/cm/qingbaijiangqu/'],['新津','3','https://www.anjuke.com/chengdu/cm/xinjinxian/'],['金堂','4','https://www.anjuke.com/chengdu/cm/jintangxian/'],['彭州','3','https://www.anjuke.com/chengdu/cm/pengzhoushi/'],['崇州','4','https://www.anjuke.com/chengdu/cm/chongzhoushi/'],['大邑','3','https://www.anjuke.com/chengdu/cm/dayixian/']]

# LS = [['简阳','3','https://www.anjuke.com/chengdu/cm/jianyangsh/'],['邛崃','3','https://www.anjuke.com/chengdu/cm/qionglaishi/']]
LS = [['闵行','16','https://www.anjuke.com/shanghai/cm/minhang/'],['宝山','12','https://www.anjuke.com/shanghai/cm/baoshan/'],['徐汇','16','https://www.anjuke.com/shanghai/cm/xuhui/'],['松江','11','https://www.anjuke.com/shanghai/cm/songjiang/'],['嘉定','11','https://www.anjuke.com/shanghai/cm/jiading/'],['浦东','34','https://www.anjuke.com/shanghai/cm/pudong/'],['闵行','16','https://www.anjuke.com/shanghai/cm/minhang/'],['宝山','12','https://www.anjuke.com/shanghai/cm/baoshan/'],['徐汇','16','https://www.anjuke.com/shanghai/cm/xuhui/'],['松江','11','https://www.anjuke.com/shanghai/cm/songjiang/'],['嘉定','17','https://www.anjuke.com/shanghai/cm/jiading/'],['静安','17','https://www.anjuke.com/shanghai/cm/jingan/'],['普陀','11','https://www.anjuke.com/shanghai/cm/putuo/'],['杨浦','13','https://www.anjuke.com/shanghai/cm/yangpu/'],['虹口','12','https://www.anjuke.com/shanghai/cm/hongkou/'],['长宁','12','https://www.anjuke.com/shanghai/cm/changning/'],['黄浦','13','https://www.anjuke.com/shanghai/cm/huangpu/'],['青浦','9','https://www.anjuke.com/shanghai/cm/qingpu/'],['奉贤','7','https://www.anjuke.com/shanghai/cm/fengxian/'],['金山','5','https://www.anjuke.com/shanghai/cm/jinshan/'],['崇明','3','https://www.anjuke.com/shanghai/cm/chongming/']]

for l in LS:
name = l[0]
pn = l[1]
print pn
url = l[2]
txtpath = url.split('/')[-2]
# print txtpath
# for x in range(1, len(l)):
# print x
# # pass

urls = ['https://www.anjuke.com/shanghai/cm/'+txtpath+'/p{}'.format(i) for i in range(1,int(pn))]
print urls
time.sleep(6)
header = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'zh-CN,zh;q=0.8',
'Cache-Control':'no-cache',
'User-Agent':'Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit /537.36(KHTML, like Gecko) Chrome/70.0.3510.2 Safari/537.36'
}
for url in urls:
request = urllib2.Request(url,headers=header)
response = urllib2.urlopen(request, timeout=15)
data = response.read()
response.close()
searchObj = re.findall(r'/shanghai/cm\d.*?/"\n\s*target="_blank">(.*?)</a></em>', data, re.M|re.I)
# searchObj = re.findall(r'/chengdu/cm\d.*?/"\n\s*target="_blank">(.*?)</a></em>', data, re.M|re.I)
for i in searchObj:
print i +';',
time.sleep(0.7)
with open("D:\\down\\loupan\\shanghai\\{}.txt".format(txtpath), "a") as f:
f.write(name + str(i) +"," )

#"D:\Program Files (x86)\Anaconda2\python.exe" "D:/Program Files/JetBrains/PyCharm 2017.3.3/helpers/pydev/爬楼批量1012.py"
#34
#民乐城秀园西苑; 三林苑; 宣桥枫庭; 仁恒滨江园(三期);


#打印 文件夹 文件数及文件名

#
# -*- coding: utf-8 -*-
__author__ = 'Administrator'

# import os
# path = os.getcwd() #获取当前路径
# count = 0
# for root,dirs,files in os.walk(path): #遍历统计
# for each in files:
# count += 1 #统计文件夹下文件个数
# print count #输出结果



#统计 /home/dir/ 下的文件夹个数
import os
path ="D:\\down\\loupan\\1012"
count = 0
for fn in os.listdir(path): #fn 表示的是文件名
count = count+1
print "L"+ str(count) + ' = '+' path ' + "+'"+fn+"'"
# print count
for i in range(1,count+1):
print 'L'+str(i)+',',



## 统计文件夹下文件个数

## 发布到数据库


# -*- coding: utf-8 -*-
import MySQLdb
import re
import random
from random import choice
import time
from faker import Faker
fake = Faker("zh_CN")

# 连接MySQL数据库
# db = MySQLdb.connect("127.0.0.1", "root", "*****%", "data", charset="utf8")



db = MySQLdb.connect(host="cd.zgxd.net", port=3306, user="cd_zgxd_net",passwd="*****BI5%",db="cd_zgxd_net",charset="utf8")
cursor = db.cursor()

# for i in range(1,5):

# file = 'D:\down\shanghai\Jjingan.txt'
# file = 'D:\down\shanghai\TCD.txt'
# file = 'D:\down\P\PChengdu\T.txt'

# file = 'D:\\down\\loupan\\chengdu\\TCD.txt'
path= "D:\\down\\loupan\\1012\\"

L1 = path +'cdpujiangxian.txt'
L2 = path +'chongzhoushi.txt'
L3 = path +'dujiangyan.txt'
L4 = path +'jianyangsh.txt'
L5 = path +'jintangxian.txt'
L6 = path +'longquanyi.txt'
L7 = path +'pengzhoushi.txt'
L8 = path +'piduqu.txt'
L9 = path +'qingbaijiangqu.txt'
L10 = path +'qionglaishi.txt'
L11 = path +'shuangliu.txt'
L12 = path +'wenjiang.txt'
L13 = path +'xindu.txt'
L14 = path +'xinjinxian.txt'

LS=[L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14]
for l in LS:
sale = ['销售', '出售', '低转', '转让', '出售', '低售', '急售', '诚意出售', '亏本出售','亏转','急转','个人转让','个人出售','诚售']
pp = ['REALSTAR', '绿奥', '洁丰', '奥维斯特', '玫瑰园', '伊斯曼', '布兰妮', '正章', '赛维', '航星', 'CAS', '卡柏', '衣之恋', '衣适家', '灰姑娘', '德奈福',
'伊卡诺', '洁神', '约瑟芬', '依莱尔', '净衣馆', '朵拉', '阿玛尼', '象王', '万星', '雄狮', '福奈特', '澳洁', '泰洁', '伊尔萨', '玛丽阿姨', '蕾奇尔', '尤萨',
'威特斯', '施柏丽', '康洁', '美涤', '格利特', '执燊', '石油', '绿奥', '洁丰', '奥维斯特', '玫瑰园', '伊斯曼', '布兰妮', '多美依', '正章', '赛维', 'CAS', '卡柏',
'衣之恋', '衣适家', '灰姑娘', '德奈福', '伊卡诺', '洁神', '约瑟芬', '喜兰妮', '净衣馆', '朵拉', '阿玛尼', '象王', '美一天', '涤派', '福奈特', '澳洁', '泰洁',
'玛丽阿姨', '蕾奇尔', '尤萨', '威特斯', '伊尔萨', '澳洁', '泰洁', '施柏丽', '航星', '绿奥', '洁丰', '奥维斯特', '玫瑰园', '伊斯曼', '布兰妮', '正章', '赛维',
'CAS', 'UCC', '卡柏', '衣之恋', '衣适家', '灰姑娘', '德奈福', '伊卡诺', '洁神', '约瑟芬', '澳贝森', '净衣馆', '阿玛尼', '朵拉', '象王', '雪芙莱', '凯瑟琳',
'鼎好', '天天新', '皇家圣雪', '贝朗', '优力美','维特妮','汉洁'
]
sb = ['干洗机', '水洗机', '干洗机', '干洗设备', '干洗店设备', '水洗设备', '干洗店设备', '洗涤设备', '宾馆洗衣房设备', '洗衣房设备', '烘干机', '烘干设备']

cs = ['八成新', '九成新', '全新', '', '九五新', '九九新', '八五新', '']
JG = ['9000','8000','7000','6000','5000','4000','3000','8500','7500','6500','5500','4500','3500','面议','电议']

zs = [',赠送', ',另有', ',', ',加送', ',免费送']
other = ['熨烫设备', '缝纫机', '消毒柜', '服装传输线', '干洗材料', '水洗材料', '四氯乙烯', '干洗耗材']
rl = ['8公斤','10公斤','12公斤','15公斤','20公斤','25公斤','30公斤','10公斤','50公斤']
sql = "select max(id) from `phome_ecms_news`"
cursor.execute(sql)
id_info = cursor.fetchone()

ClassId = '13'
# QQ = '527573696'
Writer = "上海干洗"
Befrom = "zgxd.net"
Company = "上海干洗加盟"
# Phone = '13983000191'
dq = '成都市'
print id_info
if id_info[0] != None:
g_id = id_info[0]
else:
g_id = 0

for line in open(l):
lines = line.split(',')

pattern = re.compile(r',') # 查找,
result1 = pattern.findall(line)
max = len(result1) + 1

print 'max:' + str(max)

total = max + g_id

print "total:" +str(total)
n = 1
for ad in lines:
g_id = g_id + 1
dianji_i = random.randint(16858, 39899)
titlepic_i = random.randint(1, 265)
sa = choice(sale)
pp1 = choice(pp)
sb1 = choice(sb)
cs1 = choice(cs)
zs1 = choice(zs)
rl1 = choice(rl)
other1 = choice(other)
man = fake.last_name().encode('utf-8') + '先生' # 男性姓名
woman = fake.last_name().encode('utf-8') + '女士' # 女性姓名
QQ = random.randint(56858, 899989988)
name = [man, woman]
name1 = choice(name)
data = str(fake.date_between(start_date="-2y", end_date="now"))
phone = str(fake.phone_number())
dq = '成都' + ad
Company = '成都' + pp1

i = sa + cs1 + pp1 + sb1
smalltext = '成都' + ad + i + zs1 + other1 + '.联系人:' + name1

newstext = smalltext + ',' + '出厂日期:' + data + '.' + pp1 + sb1 + '.规格:' + rl1
# dianji_i = '999'
smalltext = newstext
detail_title = i
detail_smalltext = newstext
detail_infotags = pp1
detail_keyboard = sb1
detail_titlepic = 'http://www.pictutu.com/img/ganxiji/' + str(titlepic_i) + '.jpg'
jiage = choice(JG)

sql = "insert into `phome_ecms_news` (`id`, `classid`, `onclick`, `newspath`, `keyboard`, `keyid`, `userid`, `username`, `ztid`, `checked`, `istop`, `truetime`, `ismember`, `dokey`, `userfen`, `isgood`, `titlefont`, `titleurl`, `filename`, `groupid`, `newstempid`, `plnum`, `firsttitle`, `isqf`, `totaldown`, `title`, `newstime`, `titlepic`, `closepl`, `havehtml`, `lastdotime`, `haveaddfen`, `infopfen`, `infopfennum`, `votenum`, `ftitle`, `smalltext`, `diggtop`, `stb`, `copyids`, `ttid`, `infotags`, `ispic`, `phone`) values ( %s, %s, %s, '%s', '%s', '', '1', 'admin', '', '1', '0', '%s', '0', '1', '0', '0', '', '', '%s', '0', '0', '0', '0', '0', '0', '%s', '%s', '%s', '0', '1', '%s', '0', '0', '0', '0', '', '%s', '0', '1', '1', '0', '%s', '1', %s);" % (
g_id, ClassId, dianji_i, time.strftime("%Y-%m-%d", time.localtime()), detail_keyboard, int(time.time()), g_id,
detail_title, int(time.time()), detail_titlepic, int(time.time()), detail_smalltext, detail_infotags, phone) # 写入数据库

# sql = "INSERT INTO test(text,title) VALUES ('%s','%s')" % (sa,i)
cursor.execute(sql)
sql = "insert into `phome_ecms_news_data_1` (`id`, `classid`, `writer`, `befrom`, `newstext`, `company`, `qq`, `jiage`, `dq`, `pp`,`cs`,`sb`,`type`) values ( %s, %s, '%s', '%s', '%s', '%s', %s, '%s','%s','%s','%s','%s','%s');" % (
g_id, ClassId, name1, Befrom, newstext, Company, QQ, jiage,dq,pp1,cs1,sb1,rl1) # Company
cursor.execute(sql)

try:

db.commit()
n += 1
except:
db.rollback()
print("insert error")
db.close()
print '插入完成' + str(max) + '记录'
print '当前已经完成的ID:' + str(total)

 0  已被阅读了54次  楼主 2018-10-12 20:34:26
回复列表
guest
2F
游客Lv0 

# Reply fragment: per-district input-file path assignments for Hangzhou,
# in the same L<n>-style format that the folder-listing helper prints.
# NOTE(review): `path2` is not defined in this fragment, and the final bare
# tuple line has no effect on its own — this appears to be meant for pasting
# into the publishing script (replacing its L1..L14 / LS block).
AD1 =  path2 +'binjiangb.txt'

AD2 =  path2 +'chunan.txt'

AD3 =  path2 +'fuyang.txt'

AD4 =  path2 +'gongshu.txt'

AD5 =  path2 +'jiande.txt'

AD6 =  path2 +'jianggan.txt'

AD7 =  path2 +'linanq.txt'

AD8 =  path2 +'shangcheng.txt'

AD9 =  path2 +'tonglu.txt'

AD10 =  path2 +'xiacheng.txt'

AD11 =  path2 +'xiaoshan.txt'

AD12 =  path2 +'xihu.txt'

AD13 =  path2 +'yuhang.txt'

AD1, AD2, AD3, AD4, AD5, AD6, AD7, AD8, AD9, AD10, AD11, AD12, AD13,

 0   17天前 回复

回复:三步采集

LOOK官方站 联系站长
Powered by Look博客