# 上次为一家公司做的,但当时时间紧张就搁置了;现在把程序放上来,希望对一些朋友有帮助。
# -*- coding: cp936 -*-
#author:digmouse
#website:www.htmldata.cn
#Date:2009-5-25
#
import html
import re
import urllib
import MySQLdb
import db
def down_img(url, name, dest_dir="c:/"):
    """Download one image and save it to disk.

    Args:
        url: Image address without the scheme; ``http://`` is prepended.
            Every ``_small1`` in it is removed before fetching (presumably
            this maps a thumbnail address to the full-size one -- confirm).
        name: File name for the saved image.
        dest_dir: Directory to save into. Defaults to ``c:/``, the location
            that used to be hard-coded.

    I/O errors from the download or from the file write propagate to the
    caller unchanged.
    """
    import contextlib
    import os
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    path = os.path.join(dest_dir, str(name))
    full_url = 'http://' + url.replace('_small1', '')
    # Fetch first so that a failed download does not leave an empty file.
    # closing() is used because the Python 2 response object is not a
    # context manager.
    with contextlib.closing(urlopen(full_url)) as response:
        data = response.read()
    with open(path, 'wb') as out:
        out.write(data)
def get_value(data):
    """Print the manufacturer and brand found in a car-parameter page.

    Each value is the text between a label cell (``<td>LABEL:``) and the
    next ``</td>``, passed through ``html.strip_html`` and ``html.trim``
    (project helpers; presumably they drop markup and surrounding
    whitespace -- confirm).

    Args:
        data: Page source as a string.

    Raises:
        SystemExit: When a label or its closing ``</td>`` is missing (after
            printing ``b_N is error`` / ``b_N_1 is error``), and also
            unconditionally once both values have been printed.
    """
    def _cell_after(label, tag):
        # Return the cleaned text between `label` and the following </td>.
        start = data.find(label)
        if start == -1:
            print('%s is error' % tag)
            raise SystemExit
        rest = data[start + len(label):]
        end = rest.find('</td>')
        if end == -1:
            print('%s_1 is error' % tag)
            raise SystemExit
        return html.trim(html.strip_html(rest[:end]))

    # NOTE(review): the label punctuation is reproduced as found in the
    # source; confirm the colon matches what the site actually serves.
    # Manufacturer
    company = _cell_after('<td>生产厂商:', 'b_1')
    print('生产厂商: %s' % company)
    # Brand
    pp = _cell_after('<td>品牌:', 'b_2')
    print('品牌 %s' % pp)
    # NOTE(review): the original ends the whole program here, so a crawl
    # stops after the first car. Kept as-is; looks like a debugging
    # leftover -- confirm before removing.
    raise SystemExit
def get_value_1(data):
    """Print the launch date and the manufacturer's list price from a page.

    Each value is the text between a full label cell
    (``<td width="..">LABEL:</td>``) and the next ``</td>``, passed through
    ``html.strip_html`` and ``html.trim`` (project helpers; presumably they
    drop markup and surrounding whitespace -- confirm).

    Args:
        data: Page source as a string.

    Raises:
        SystemExit: When a label or the following ``</td>`` is missing,
            after printing ``b_N is error`` / ``b_N_1 is error``.
    """
    def _cell_after(label, tag):
        # Return the cleaned text between `label` and the following </td>.
        start = data.find(label)
        if start == -1:
            print('%s is error' % tag)
            raise SystemExit
        rest = data[start + len(label):]
        end = rest.find('</td>')
        if end == -1:
            print('%s_1 is error' % tag)
            raise SystemExit
        return html.trim(html.strip_html(rest[:end]))

    # Launch date
    date = _cell_after('<td width="70">上市日期:</td>', 'b_2')
    print('上市日期 %s' % date)
    # Manufacturer's list price
    bj = _cell_after('<td width="80">厂方指导价:</td>', 'b_3')
    print('厂家报价 %s' % bj)
def get_image(data):
    """Download every sinaimg.cn JPEG in *data* whose address has no ``_``.

    Args:
        data: Response text to scan for ``www.sinaimg.cn ... .jpg``
            addresses. Backslashes inside a match are removed (the payload
            presumably escapes ``/`` as ``\\/`` -- confirm).

    Each accepted address is printed, followed by a blank line, and handed
    to ``down_img`` together with its last path segment as the file name.
    """
    for raw in re.findall(r'www\.sinaimg\.cn.+?\.jpg', data):
        img_url = raw.replace('\\', '')
        # NOTE(review): the original indentation was lost; the download is
        # assumed to sit inside this underscore filter -- confirm.
        if '_' in img_url:
            continue
        print(img_url)
        print('')
        # Last path segment; unlike the former four-slash regex this does
        # not fail on shallow paths.
        img_name = img_url.rsplit('/', 1)[-1]
        down_img(img_url, img_name)
def get_all_url(data):
    """Return the targets of all links whose address ends in ``/<digits>``.

    Args:
        data: HTML source to scan.

    Returns:
        A list with the ``href`` value of every ``<a href="...">`` whose
        value ends in a slash followed by one or more digits, in document
        order. Empty when nothing matches.
    """
    # [^"] keeps a match inside a single attribute value; a bare "." could
    # run from one non-matching link into a later matching one.
    return re.findall(r'<a href="([^"]+?/[0-9]+)"', data)
# Crawl script: collect the numbered links on the Sina auto centre page and,
# for each one, print its summary values, download its pictures and print the
# detailed parameters of the cars listed under it.
url = 'http://auto.sina.com.cn/autocenter/'
data = html.get_html(url)
urls = get_all_url(data)
for url in urls:
    print(url)
    # Read the values shown on the linked page itself.
    data_2 = html.get_html(url)
    get_value_1(data_2)
    x = re.findall(r'[0-9]+', url)
    xx = x[0]  # first run of digits in the URL; used as "subid" below
    # Picture ids that follow "picture.html#"; sometimes none are found.
    pic_num = re.findall(r'picture.html#([0-9]+)', data_2)
    for num in pic_num:
        # This address returns the locations of the large images.
        new_url = 'http://photo.auto.sina.com.cn/picture.php?p=' + num + '&r=7944&_='
        data_3 = html.get_html(new_url)
        get_image(data_3)
    new_url = 'http://data.auto.sina.com.cn/car/ajax/carlist.php?subid=' + xx + '&product=0'
    data_1 = html.get_html(new_url)
    num = re.findall(r'(?<=car_id":").*?(?=")', data_1)
    for n in num:
        url_2 = 'http://data.auto.sina.com.cn/car/car.php?carid=' + str(n) + '&tmp=canshu'
        data_2 = html.get_html(url_2)
        get_value(data_2)  # print the detailed parameters of this car
# -*- coding: cp936 -*-
#author:digmouse
#website:www.htmldata.cn
#Date:2009-5-25
#
import html
import re
import urllib
import MySQLdb
import db
def down_img(url, name, dest_dir="c:/"):
    """Download one image and save it to disk.

    Args:
        url: Image address without the scheme; ``http://`` is prepended.
            Every ``_small1`` in it is removed before fetching (presumably
            this maps a thumbnail address to the full-size one -- confirm).
        name: File name for the saved image.
        dest_dir: Directory to save into. Defaults to ``c:/``, the location
            that used to be hard-coded.

    I/O errors from the download or from the file write propagate to the
    caller unchanged.
    """
    import contextlib
    import os
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    path = os.path.join(dest_dir, str(name))
    full_url = 'http://' + url.replace('_small1', '')
    # Fetch first so that a failed download does not leave an empty file.
    # closing() is used because the Python 2 response object is not a
    # context manager.
    with contextlib.closing(urlopen(full_url)) as response:
        data = response.read()
    with open(path, 'wb') as out:
        out.write(data)
def get_value(data):
    """Print the manufacturer and brand found in a car-parameter page.

    Each value is the text between a label cell (``<td>LABEL:``) and the
    next ``</td>``, passed through ``html.strip_html`` and ``html.trim``
    (project helpers; presumably they drop markup and surrounding
    whitespace -- confirm).

    Args:
        data: Page source as a string.

    Raises:
        SystemExit: When a label or its closing ``</td>`` is missing (after
            printing ``b_N is error`` / ``b_N_1 is error``), and also
            unconditionally once both values have been printed.
    """
    def _cell_after(label, tag):
        # Return the cleaned text between `label` and the following </td>.
        start = data.find(label)
        if start == -1:
            print('%s is error' % tag)
            raise SystemExit
        rest = data[start + len(label):]
        end = rest.find('</td>')
        if end == -1:
            print('%s_1 is error' % tag)
            raise SystemExit
        return html.trim(html.strip_html(rest[:end]))

    # NOTE(review): the label punctuation is reproduced as found in the
    # source; confirm the colon matches what the site actually serves.
    # Manufacturer
    company = _cell_after('<td>生产厂商:', 'b_1')
    print('生产厂商: %s' % company)
    # Brand
    pp = _cell_after('<td>品牌:', 'b_2')
    print('品牌 %s' % pp)
    # NOTE(review): the original ends the whole program here, so a crawl
    # stops after the first car. Kept as-is; looks like a debugging
    # leftover -- confirm before removing.
    raise SystemExit
def get_value_1(data):
    """Print the launch date and the manufacturer's list price from a page.

    Each value is the text between a full label cell
    (``<td width=".." class="bge8e8e8">LABEL:</td>``) and the next
    ``</td>``, passed through ``html.strip_html`` and ``html.trim`` (project
    helpers; presumably they drop markup and surrounding whitespace --
    confirm).

    Args:
        data: Page source as a string.

    Raises:
        SystemExit: When a label or the following ``</td>`` is missing,
            after printing ``b_N is error`` / ``b_N_1 is error``.
    """
    def _cell_after(label, tag):
        # Return the cleaned text between `label` and the following </td>.
        start = data.find(label)
        if start == -1:
            print('%s is error' % tag)
            raise SystemExit
        rest = data[start + len(label):]
        end = rest.find('</td>')
        if end == -1:
            print('%s_1 is error' % tag)
            raise SystemExit
        return html.trim(html.strip_html(rest[:end]))

    # Launch date
    date = _cell_after('<td width="70" class="bge8e8e8">上市日期:</td>', 'b_2')
    print('上市日期 %s' % date)
    # Manufacturer's list price
    bj = _cell_after('<td width="80" class="bge8e8e8">厂方指导价:</td>', 'b_3')
    print('厂家报价 %s' % bj)
def get_image(data):
    """Download every sinaimg.cn JPEG in *data* whose address has no ``_``.

    Args:
        data: Response text to scan for ``www.sinaimg.cn ... .jpg``
            addresses. Backslashes inside a match are removed (the payload
            presumably escapes ``/`` as ``\\/`` -- confirm).

    Each accepted address is printed, followed by a blank line, and handed
    to ``down_img`` together with its last path segment as the file name.
    """
    for raw in re.findall(r'www\.sinaimg\.cn.+?\.jpg', data):
        img_url = raw.replace('\\', '')
        # NOTE(review): the original indentation was lost; the download is
        # assumed to sit inside this underscore filter -- confirm.
        if '_' in img_url:
            continue
        print(img_url)
        print('')
        # Last path segment; unlike the former four-slash regex this does
        # not fail on shallow paths.
        img_name = img_url.rsplit('/', 1)[-1]
        down_img(img_url, img_name)
def get_all_url(data):
    """Return the targets of all links whose address ends in ``/<digits>``.

    Args:
        data: HTML source to scan.

    Returns:
        A list with the ``href`` value of every ``<a href="...">`` whose
        value ends in a slash followed by one or more digits, in document
        order. Empty when nothing matches.
    """
    # [^"] keeps a match inside a single attribute value; a bare "." could
    # run from one non-matching link into a later matching one.
    return re.findall(r'<a href="([^"]+?/[0-9]+)"', data)
# Crawl script: collect the numbered links on the Sina auto centre page and,
# for each one, print its summary values, download its pictures and print the
# detailed parameters of the cars listed under it.
# NOTE(review): an equivalent crawl appears earlier in this file; this looks
# like a pasted second copy -- confirm whether both are meant to run.
url = 'http://auto.sina.com.cn/autocenter/'
data = html.get_html(url)
urls = get_all_url(data)
for url in urls:
    print(url)
    # Read the values shown on the linked page itself.
    data_2 = html.get_html(url)
    get_value_1(data_2)
    x = re.findall(r'[0-9]+', url)
    xx = x[0]  # first run of digits in the URL; used as "subid" below
    # Picture ids that follow "picture.html#"; sometimes none are found.
    pic_num = re.findall(r'picture.html#([0-9]+)', data_2)
    for num in pic_num:
        # This address returns the locations of the large images.
        new_url = 'http://photo.auto.sina.com.cn/picture.php?p=' + num + '&r=7944&_='
        data_3 = html.get_html(new_url)
        get_image(data_3)
    new_url = 'http://data.auto.sina.com.cn/car/ajax/carlist.php?subid=' + xx + '&product=0'
    data_1 = html.get_html(new_url)
    num = re.findall(r'(?<=car_id":").*?(?=")', data_1)
    for n in num:
        url_2 = 'http://data.auto.sina.com.cn/car/car.php?carid=' + str(n) + '&tmp=canshu'
        data_2 = html.get_html(url_2)
        get_value(data_2)  # print the detailed parameters of this car