看世界
No pn
# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup

url = "https://www.anjuke.com/chengdu/cm/"
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request, timeout=15)
data = response.read()
soup = BeautifulSoup(data, "html.parser")

# District links look like https://www.anjuke.com/chengdu/cm/<district>/
listurls = soup.find_all('a', href=re.compile('https://www.anjuke.com/chengdu/cm/[a-zA-Z]{4,15}/'))
for link in listurls:
    # Skip "surrounding area" (zhoubian) links and pagination (pN) links
    if not re.search(r'(.*zhoubian|p(\d+))', link['href']):
        print link.get_text(), link['href']
        name = link.get_text()
        # 'pn' is a placeholder for the district's page count, filled in later
        with open("D:\\down\\loupan\\chengdu\\name.txt", "a") as f:
            f.write("['" + name.encode("utf8") + "','" + "pn" + "','" + link['href'].encode("utf8") + "'],")

Output:
青羊 https://www.anjuke.com/chengdu/cm/qingyang/
锦江 https://www.anjuke.com/chengdu/cm/jinjiang/
金牛 https://www.anjuke.com/chengdu/cm/jinniu/
武侯 https://www.anjuke.com/chengdu/cm/wuhou/
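The 'pn' placeholder still has to be replaced with each district's real page count. A minimal sketch of one way to automate that, assuming the pagination links on a district page contain paths like /p2/, /p3/ (the helper name and the URL pattern are assumptions, not part of the original script):

import re
import urllib2

# Hypothetical helper: guess a district's page count from its pagination links.
# Assumes pagination hrefs embed /p<N> paths; verify against the live markup.
def guess_pn(district_url, header):
    request = urllib2.Request(district_url, headers=header)
    data = urllib2.urlopen(request, timeout=15).read()
    pages = [int(n) for n in re.findall(r'/p(\d+)', data)]
    return max(pages) if pages else 1

# Example: print guess_pn("https://www.anjuke.com/chengdu/cm/qingyang/", header)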
IT · admin · 59 days ago
All developments
# -*- coding: UTF-8 -*-
import re
import time
import urllib2

path = "D:\\down\\loupan\\chengdu\\"

# Full district list: [name, page count ('pn' until known), district URL]
# LS = [['温江','7','www.anjuke.com/chengdu/cm/wenjiang/'], ['青羊','pn','www.anjuke.com/chengdu/cm/qingyang/'],
#       ['锦江','pn','www.anjuke.com/chengdu/cm/jinjiang/'], ['金牛','pn','www.anjuke.com/chengdu/cm/jinniu/'],
#       ['武侯','pn','www.anjuke.com/chengdu/cm/wuhou/'], ['成华','pn','www.anjuke.com/chengdu/cm/chenghua/'],
#       ['高新区','pn','www.anjuke.com/chengdu/cm/gaoxin/'], ['天府新区','pn','www.anjuke.com/chengdu/cm/tainfuxinqu/'],
#       ['龙泉驿','pn','www.anjuke.com/chengdu/cm/longquanyi/'], ['双流','pn','www.anjuke.com/chengdu/cm/shuangliu/'],
#       ['都江堰','pn','www.anjuke.com/chengdu/cm/dujiangyan/'], ['郫都','pn','www.anjuke.com/chengdu/cm/piduqu/'],
#       ['新都','pn','www.anjuke.com/chengdu/cm/xindu/'], ['青白江','pn','www.anjuke.com/chengdu/cm/qingbaijiangqu/'],
#       ['新津','pn','www.anjuke.com/chengdu/cm/xinjinxian/'], ['金堂','pn','www.anjuke.com/chengdu/cm/jintangxian/'],
#       ['彭州','pn','www.anjuke.com/chengdu/cm/pengzhoushi/'], ['崇州','pn','www.anjuke.com/chengdu/cm/chongzhoushi/'],
#       ['大邑','pn','www.anjuke.com/chengdu/cm/dayixian/'], ['邛崃','pn','www.anjuke.com/chengdu/cm/qionglaishi/'],
#       ['蒲江','pn','www.anjuke.com/chengdu/cm/cdpujiangxian/'], ['简阳','pn','www.anjuke.com/chengdu/cm/jianyangsh/']]
# Page counts already discovered in earlier runs:
# ['都江堰','7'], ['郫都','9'], ['新都','8'], ['青白江','3'], ['新津','3'], ['金堂','4'],
# ['彭州','3'], ['崇州','4'], ['大邑','3'], ['邛崃','3'], ['蒲江','2'], ['简阳','3']

# Districts to crawl in this run
LS = [['大邑', '2', 'www.anjuke.com/chengdu/cm/dayixian/'],
      ['蒲江', '2', 'www.anjuke.com/chengdu/cm/cdpujiangxian/']]

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}

for idx, l in enumerate(LS, start=1):
    name = l[0]
    pn = l[1]
    txtpath = l[2].split('/')[-2]

    # Pages p1 .. p(pn-1) for this district
    urls = ['https://www.anjuke.com/chengdu/cm/' + txtpath + '/p{}'.format(i) for i in range(1, int(pn))]
    print urls
    # Emit an assignment line to paste into the MySQL loader script further down
    print 'L' + str(idx) + " = path + '" + txtpath + ".txt'"

    for url in urls:
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request, timeout=15)
        data = response.read()
        response.close()
        # Community names sit in anchors like /chengdu/cm<digits>.../" target="_blank">NAME</a></em>
        searchObj = re.findall(r'/chengdu/cm\d.*?/"\n\s*target="_blank">(.*?)</a></em>', data, re.M | re.I)
        for i in searchObj:
            time.sleep(0.1)
            with open("D:\\down\\loupan\\test\\{}.txt".format(txtpath), "a") as f:
                f.write(name + str(i) + ",")
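The original imports random and choice without using them; a randomized pause between page fetches is the usual way such scrapers avoid arriving at a fixed rhythm and getting rate-limited. A minimal sketch, with the 2-6 second window chosen arbitrarily:

import random
import time

def polite_pause(low=2.0, high=6.0):
    # Sleep a random 2-6 seconds between requests.
    time.sleep(random.uniform(low, high))

# Usage inside the page loop, after each urllib2.urlopen call:
# polite_pause()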
IT · admin · 59 days ago
Iterating over a dict
txtpath = "D:\\down\\loupan\\chengdu\\1010.txt"

# Each line of the file is expected to look like key:value
dict_data = {}
with open(txtpath, 'r') as f:
    for kv in [d.strip().split(':') for d in f]:
        dict_data[kv[0]] = kv[1]

for (k, v) in dict_data.items():
    print k
    print v
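For reference, a self-contained run of the same parsing logic on made-up lines (the sample data below is illustrative only, not the contents of 1010.txt):

# Simulate a two-line key:value file in memory.
sample_lines = ['one:1', 'two:2']
demo = {}
for kv in [d.strip().split(':') for d in sample_lines]:
    demo[kv[0]] = kv[1]
print demo  # {'two': '2', 'one': '1'} -- values stay strings unless converted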
IT · admin · 61 days ago
dict   
# -*- coding: utf-8 -*-
# Note: the original named this variable "dict", which shadows the built-in dict type.
d = {'one': 1, 'two': 2, 'three': 3}
print d
print d.keys()
print d.values()
IT · admin · 61 days ago
1010 (with image)
[Image]
look财经 · admin · 61 days ago
Community developments
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Completed 2018-04-20 16:52:59
# 21zgxdnet
import re
import time
import urllib2

# Earlier runs targeted other districts/sites:
# urls = ['https://www.anjuke.com/shanghai/cm/putuo/p{}'.format(i) for i in range(1, 11)]
# urls = ['https://www.anjuke.com/chengdu/cm/wuhou/p{}'.format(i) for i in range(1, 15)]
# urls = ['https://www.yrw.com/products/list-direct-all-performance-1-createTimeDesc-{}.html'.format(i) for i in range(1, 11)]
urls = ['https://www.anjuke.com/chengdu/cm/gaoxin/p{}'.format(i) for i in range(1, 8)]

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}

for url in urls:
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request, timeout=15)
    data = response.read()
    response.close()
    # Community names sit in anchors like /chengdu/cm<digits>.../" target="_blank">NAME</a></em>
    searchObj = re.findall(r'/chengdu/cm\d.*?/"\n\s*target="_blank">(.*?)</a></em>', data, re.M | re.I)
    for i in searchObj:
        print i + ';',
        time.sleep(0.1)
        with open("D:\\down\\loupan\\chengdu\\gaoxin.txt", "a") as f:
            f.write("高新区" + "'" + str(i) + "',")
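Extracting the names with a regex is brittle against markup changes; the first script in this thread already uses BeautifulSoup, and the same extraction can be phrased that way. A sketch, assuming community links are <a target="_blank"> anchors whose href starts with /chengdu/cm followed by digits (inferred from the regex above, not re-verified against the live page):

import re
from bs4 import BeautifulSoup

# data is the page HTML fetched in the loop above
soup = BeautifulSoup(data, "html.parser")
for a in soup.find_all('a', target='_blank', href=re.compile(r'/chengdu/cm\d')):
    print a.get_text()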
IT · admin · 62 days ago
import MySQLdb

db = MySQLdb.connect("127.0.0.1", "root", "********", "data", charset="utf8")
cursor = db.cursor()

path = "D:\\down\\loupan\\chengdu\\"
L1 = path + 'chenghua.txt'
L2 = path + 'gaoxin.txt'
# path = "D:\\down\\loupan\\test\\"
# L1 = path + 'L1.txt'
# L2 = path + 'L2.txt'
LS = [L1, L2]

for l in LS:
    for line in open(l):
        # Each comma-separated field becomes one row
        for ad in line.split(','):
            print ad
            sql = "INSERT INTO test(text,title) VALUES ('%s','%s')" % (ad, ad)
            cursor.execute(sql)
            db.commit()

db.close()
print 'ok'

Note: the TXT files must use ',' as the only separator; don't wrap values in quotes, and don't leave a trailing comma.
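Formatting the values straight into the SQL string breaks as soon as a community name contains a quote; MySQLdb escapes values itself when they are passed as execute() parameters. A minimal sketch of that variant, against the same test table as above:

# Let the driver escape the values instead of %-formatting them into the SQL.
sql = "INSERT INTO test(text, title) VALUES (%s, %s)"
for ad in line.split(','):
    cursor.execute(sql, (ad, ad))
db.commit()  # one commit per line is enough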
IT · admin · 62 days ago
TEMP   
path = "D:\\down\\loupan\\test\\"
L1 = path + 'L1.txt'
L2 = path + 'L2.txt'
IT · admin · 62 days ago
try   
#!/usr/bin/python
# -*- coding: UTF-8 -*-
try:
    fh = open("testfile", "w")
    fh.write("This is a test file, used for testing exceptions!!")
except IOError:
    print "Error: file not found or the write failed"
else:
    print "Content written to the file successfully"
    fh.close()
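In later Python the same pattern is usually written with a with block, which closes the file even if the write raises. A minimal equivalent sketch:

try:
    # "with" closes the file automatically, even when the write fails.
    with open("testfile", "w") as fh:
        fh.write("This is a test file, used for testing exceptions!!")
except IOError:
    print "Error: file not found or the write failed"
else:
    print "Content written to the file successfully"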
IT · admin · 62 days ago