Python Web Scraping in Practice (3): Scraping NetEase News
Views: 6849
Published: 2019-06-26

This article is about 3648 characters long; reading it takes roughly 12 minutes.

The code:

# _*_ coding:utf-8 _*_
import urllib2
import re
#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')


class Tool:
    """Strips/normalizes HTML tags in an article body.

    NOTE: the tag patterns below were swallowed when this post was
    re-rendered as HTML; they are reconstructed from the standard form
    of this cleaner and flagged where they are assumptions.
    """
    removeImg = re.compile(r'<img.*?>')                  # images
    removeAddr = re.compile(r'<a.*?>|</a>')              # links
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')  # -> newline
    replaceTD = re.compile(r'<td>')                      # -> tab
    replacePara = re.compile(r'<p.*?>')                  # -> newline + indent
    replaceBR = re.compile(r'<br><br>|<br>')             # -> newline
    removeExtraTag = re.compile(r'<.*?>')                # any leftover tag

    def replace(self, text):
        text = re.sub(self.removeImg, "", text)
        text = re.sub(self.removeAddr, "", text)
        text = re.sub(self.replaceLine, "\n", text)
        text = re.sub(self.replaceTD, "\t", text)
        text = re.sub(self.replacePara, "\n" + "    ", text)
        text = re.sub(self.replaceBR, "\n", text)
        text = re.sub(self.removeExtraTag, "", text)
        return text.strip()


class WYXW:
    def __init__(self, baseUrl):
        self.baseURL = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        #self.file = None
        self.fileName = u'网易新闻'
        self.tool = Tool()

    def get_homepage(self):
        url = self.baseURL
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8', 'ignore')
        #print content#.encode('gbk','ignore')
        return content

    def extract_url(self, homepage):
        # Article links look like http://news.163.com/yy/MMdd/hh/<16 chars>.html
        pattern = r"http://news.163.com/\d{2}/\d{4}/\d{2}/\w{16}.html"
        news_url = re.findall(pattern, homepage)
        #print news_url
        return news_url

    def extract_sub_web_time(self, sub_web):
        pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', re.S)
        time = re.findall(pattern, sub_web)
        print time[0]
        return time[0]

    def extract_sub_web_source(self, sub_web):
        # NOTE: reconstructed pattern -- assumes the 2016-era markup
        # <a id="ne_article_source" ...>source</a>; the original was lost.
        pattern = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>', re.S)
        source = re.findall(pattern, sub_web)
        print source[0]
        return source[0]

    def extract_sub_web_title(self, sub_web):
        # NOTE: reconstructed pattern -- assumes the headline sat in
        # <h1 id="h1title" ...>; the original was lost.
        pattern = re.compile(r'<h1 id="h1title".*?>(.*?)</h1>', re.S)
        title = re.findall(pattern, sub_web)
        if title:  # findall returns a list; test for non-empty, not None
            print title[0]
            return title[0]
        else:
            return None

    def extract_sub_web_content(self, sub_web):
        # NOTE: reconstructed pattern -- assumes the body sat in
        # <div id="endText" ...>; the original was lost.
        pattern = re.compile(r'<div id="endText".*?>(.*?)</div>', re.S)
        content = re.findall(pattern, sub_web)
        #print content[0]
        if content:
            return content[0]
        else:
            return None

    def writeData(self, fName):
        if fName is not None:
            file = open(fName + '.txt', "w+")
        else:
            file = open(self.fileName + '.txt', "w+")
        homepage = self.get_homepage()
        news_urls = self.extract_url(homepage)
        for url in news_urls:
            print url
            web = urllib2.urlopen(url).read()
            title = self.extract_sub_web_title(web)
            content = self.extract_sub_web_content(web)
            time = self.extract_sub_web_time(web).strip()
            source = self.extract_sub_web_source(web).strip()
            if title is not None and content is not None:
                content = self.tool.replace(content)
                news = title.strip() + "\n\n" + time + "\t" + source + "\n\n" + content + "\n"
                file.write(news)
                sep = "\n" + "-------------------------------------------------------------------------" + "\n"
                file.write(sep)
                print u"新闻写入成功" + "\n"  # "news written successfully"
        file.close()


baseUrl = "http://news.163.com"
wyxw = WYXW(baseUrl)
wyxw.writeData(None)

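The homepage step hinges on a single regular expression. A quick sketch against a fabricated homepage excerpt shows what extract_url pulls out; the URLs are invented to match the shape of 2016-era NetEase article links (yy/MMdd/hh/ followed by 16 word characters):

import re

snippet = '''
<a href="http://news.163.com/16/0626/10/ABCDEF0123456789.html">headline one</a>
<a href="http://news.163.com/16/0626/11/0123456789ABCDEF.html">headline two</a>
<a href="http://sports.163.com/16/0626/12/FEDCBA9876543210.html">wrong channel</a>
'''

pattern = r"http://news.163.com/\d{2}/\d{4}/\d{2}/\w{16}.html"
print(re.findall(pattern, snippet))
# ['http://news.163.com/16/0626/10/ABCDEF0123456789.html',
#  'http://news.163.com/16/0626/11/0123456789ABCDEF.html']

Because the pattern has no capture groups, findall returns the full matched URLs, and the sports.163.com link is filtered out by the literal domain.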
 
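The script targets Python 2 (urllib2, print statements). On Python 3 the fetching layer maps onto urllib.request; the sketch below is a hedged port of the get_homepage step only, with the same headers, and the rest of the class ports analogously:

# Hedged Python 3 sketch of the fetch step (urllib2 -> urllib.request).
import urllib.request

def get_homepage(base_url="http://news.163.com"):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib.request.Request(base_url, headers=headers)
    with urllib.request.urlopen(request) as response:
        # Decode with 'ignore' like the original, since the page's
        # declared charset may not be UTF-8.
        return response.read().decode('utf-8', 'ignore')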

Reposted from: http://korul.baihongyu.com/
