如何实现新闻自动抓取?

我想做一个系统能够从一些别的网站抓新闻,然后动态添加到我的站点上来,请问如何实现?
新浪的新闻抓取
1。首页调用
<style type="text/css">
<!--
body {  font-size: 12px}
-->
</style>
<%
Server.ScriptTimeOut=120

'*********页面设置部分***********************************************************************

const m=40 '首页列出多少条新闻

const NeedTime=False '是否需要显示时间,True 表示显示时间 , False 表示不显示时间

const NewsLength=20 '新闻标题截取长度(不包括时间),注意截取了新闻长度就不能显示新闻时间

const Points="…" '截取长度后的标题要跟的省略号样子,可不填。

'*********************************************************************************************

dim wstr,str,url,start,over,i,News


on error resume next
url="http://dailynews.sina.com.cn/news1000.shtml"
wstr=getHTTPPage(url)
if err.number=0 then
start=newstring(wstr,"<!--新闻开始-->")
over=newstring(wstr,"<!--新闻结束-->")
wstr=mid(wstr,start+11,over-start-11)
wstr=replace(wstr,"<ul>","")
wstr=trim(replace(wstr,"</ul>",""))
' Set fs = CreateObject("Scripting.FileSystemObject")
' Set f = fs.CreateTextFile(server.mappath("mynews.htm"))
' f.writeLine wstr
' f.close
' set f = nothing
' set fs = nothing
str=split(wstr,"<li>")
If Unbound(str)<m then m=Unbound(str)
for i=1 to m
News=News&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
next
set str=nothing
else
wscript.echo err.description
end if


Sub writeLog(Msg)
On Error Resume Next
Dim f
Set f = fs.OpenTextFile(logfile,8,true)
f.WriteLine now & " - " & Msg
f.close
End Sub
function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear  
end function

Function bytes2BSTR(vIn)
dim strReturn
dim i,ThisCharCode,NextCharCode
strReturn = ""
For i = 1 To LenB(vIn)
ThisCharCode = AscB(MidB(vIn,i,1))
If ThisCharCode < &H80 Then
strReturn = strReturn & Chr(ThisCharCode)
Else
NextCharCode = AscB(MidB(vIn,i+1,1))
strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function

Function newstring(wstr,strng)
newstring=Instr(wstr,strng)
End Function

Function LeftNews(strng,NewsLength,NeedTime)
If NeedTime<>True then
Left_0=Instr(strng,"</a>")+3
TheRed=Instr(strng,"<font color=#ff0000>")
If TheRed>0 then
Left_1=Instr(strng,"<font color=#ff0000>")+20
Left_2=Instr(strng,"</font>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</font></a>"
End if
Else
Left_1=Instr(strng,"_blank>")+7
Left_2=Instr(strng,"</a>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</a>"
End if
End if
Else
LeftNews=strng
End if
End Function


Response.Write News '变量News为内容
%>
2。新闻列表
<style type="text/css">
<!--
body {  font-size: 12px}
-->
</style>

<a href="news.asp">首页</a>
<a href="news.asp?n=娱乐">娱乐</a>
<a href="news.asp?n=体育">体育</a>
<a href="news.asp?n=国内">国内</a>
<a href="news.asp?n=科技">科技</a>
<a href="news.asp?n=财经">财经</a>
<a href="news.asp?n=社会">社会</a>
<a href="news.asp?n=汽车">汽车</a>
<a href="news.asp?n=国际">国际</a>
<a href="news.asp?n=文教">文教</a>
<a href="news.asp?n=影音">影音</a>
<p>
<%
Server.ScriptTimeOut=120

'*********页面设置部分***********************************************************************

const m=10 '每个分类的新闻最多几条

const NeedTime=False '是否需要显示时间,True 表示显示时间 , False 表示不显示时间

const NewsLength=20 '新闻标题截取长度(不包括时间),注意截取了新闻长度就不能显示新闻时间

const Points="…" '截取长度后的标题要跟的省略号样子,可不填。

'*********************************************************************************************

dim wstr,str,url,start,over,NewsClass,i
dim n0,n1,n2,n3,n4,n5,n6,n7,n8,n9
n0=0
n1=0
n2=0
n3=0
n4=0
n5=0
n6=0
n7=0
n8=0
n9=0

NewsClass=trim(Request("n"))

on error resume next
url="http://dailynews.sina.com.cn/news1000.shtml" '新闻来源的页面
wstr=getHTTPPage(url) '取得页面内容
if err.number=0 then
start=newstring(wstr,"<!--新闻开始-->")
over=newstring(wstr,"<!--新闻结束-->")
wstr=mid(wstr,start+11,over-start-11)
wstr=replace(wstr,"href=""","href=""show.asp?url=")
wstr=replace(wstr,"<ul>","")
wstr=trim(replace(wstr,"</ul>","")) '完成对页面内容的截取加工
' Set fs = CreateObject("Scripting.FileSystemObject")
' Set f = fs.CreateTextFile(server.mappath("mynews.htm"))
' f.writeLine wstr
' f.close
' set f = nothing
' set fs = nothing
str=split(wstr,"<li>")
If NewsClass<>"" then '对分类新闻的截取
for i=1 to Ubound(str)
If Left(str(i),4)="["&NewsClass&"]" then
News=News&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
End if
next
Else '对所有新闻进行分类
for i=1 to Ubound(str)
If     Left(str(i),4)="[娱乐]" then
If n0<m then YuLe=YuLe&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n0=n0+1
Elseif Left(str(i),4)="[体育]" then
If n1<m then TiYu=TiYu&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n1=n1+1
Elseif Left(str(i),4)="[国内]" then
If n2<m then GuoNei=GuoNei&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n2=n2+1
Elseif Left(str(i),4)="[科技]" then
If n3<m then KeJi=KeJi&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n3=n3+1
Elseif Left(str(i),4)="[财经]" then
If n4<m then CaiJing=CaiJing&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n4=n4+1
Elseif Left(str(i),4)="[社会]" then
If n5<m then SheHui=SheHui&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n5=n5+1
Elseif Left(str(i),4)="[汽车]" then
If n6<m then QiChe=QiChe&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n6=n6+1
Elseif Left(str(i),4)="[国际]" then
If n7<m then GuoJi=GuoJi&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n7=n7+1
Elseif Left(str(i),4)="[影音]" then
If n8<m then YingYin=YingYin&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n8=n8+1
Elseif Left(str(i),4)="[文教]" then
If n9<m then WenJiao=WenJiao&"<li>"&LeftNews(str(i),NewsLength,NeedTime)
n9=n9+1
End if
next
End if
set str=nothing
else
wscript.echo err.description
end if

Sub writeLog(Msg)
On Error Resume Next
Dim f
Set f = fs.OpenTextFile(logfile,8,true)
f.WriteLine now & " - " & Msg
f.close
End Sub
function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear  
end function

Function bytes2BSTR(vIn)
dim strReturn
dim i,ThisCharCode,NextCharCode
strReturn = ""
For i = 1 To LenB(vIn)
ThisCharCode = AscB(MidB(vIn,i,1))
If ThisCharCode < &H80 Then
strReturn = strReturn & Chr(ThisCharCode)
Else
NextCharCode = AscB(MidB(vIn,i+1,1))
strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function

Function newstring(wstr,strng)
newstring=Instr(wstr,strng)
End Function

Function LeftNews(strng,NewsLength,NeedTime)
If NeedTime<>True then
Left_0=Instr(strng,"</a>")+3
TheRed=Instr(strng,"<font color=#ff0000>")
If TheRed>0 then
Left_1=Instr(strng,"<font color=#ff0000>")+20
Left_2=Instr(strng,"</font>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</font></a>"
End if
Else
Left_1=Instr(strng,"_blank>")+7
Left_2=Instr(strng,"</a>")
If Left_1+NewsLength>=Left_2 then
LeftNews=Left(strng,Left_0)
Else
LeftNews=Left(strng,Left_1+NewsLength)&Points&"</a>"
End if
End if
Else
LeftNews=strng
End if
End Function

'每个变量代表一个分类的新闻

Response.Write YuLe&"<p>"
Response.Write TiYu&"<p>"
Response.Write GuoNei&"<p>"
Response.Write KeJi&"<p>"
Response.Write CaiJing&"<p>"
Response.Write SheHui&"<p>"
Response.Write QiChe&"<p>"
Response.Write GuoJi&"<p>"
Response.Write YingYin&"<p>"
Response.Write WenJiao
'变量News是选择分类新闻后的变量
Response.Write News

%>
3。新闻内容
<%
Server.ScriptTimeOut=60
dim wstr,url,start,over,i


on error resume next
url=Request("url")
wstr=getHTTPPage(url)
if err.number=0 then
wstr=Autolink(wstr) '完成截取后的页面
' Set fs = CreateObject("Scripting.FileSystemObject") '把截下来的页面写在一个文件里
' Set f = fs.CreateTextFile(server.mappath("mynews.htm"))
' f.writeLine wstr
' f.close
' set f = nothing
' set fs = nothing
else
wscript.echo err.description
end if

function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear  
end function

Function bytes2BSTR(vIn)
dim strReturn
dim i,ThisCharCode,NextCharCode
strReturn = ""
For i = 1 To LenB(vIn)
ThisCharCode = AscB(MidB(vIn,i,1))
If ThisCharCode < &H80 Then
strReturn = strReturn & Chr(ThisCharCode)
Else
NextCharCode = AscB(MidB(vIn,i+1,1))
strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function

Function NewsString(wstr,strng)
NewsString=Instr(wstr,strng)
End Function

Function Autolink(strContent)
dim re
set re = New RegExp
re.IgnoreCase = True
re.Global = True
If Instr(url,"http://ent.")>0 then '影音和娱乐新闻的界面
start=NewsString(strContent,"<table width=604") '截取的起点
over=NewsString(strContent,"<center></center>") '截取的终点
strContent=mid(strContent,start,over-start) '截取新闻
re.Pattern = "\<table border=0(.[^\[]*)\<\/table>"
strContent = re.Replace(strContent,"") '去掉画中画广告
strContent = Replace(strContent,"/p>","") '去掉页面中一个奇怪的错误
strContent = Replace(strContent,"<table width=604 border=0 cellpadding=0 cellspacing=0>","")
strContent = Replace(strContent,"</table></table>","")
strContent = Replace(strContent,"<img src=http://image2.sina.com.cn/ent/news_rou.gif width=30 height=53>","")
strContent = Replace(strContent,"<img src=http://image2.sina.com.cn/ent/images/c.gif width=1 height=1>","<hr size=1 bgcolor=#d9d9d9>")
strContent = Replace(strContent,"bgcolor=#fff3ff","") '去掉背景颜色
strContent = Replace(strContent,"bgcolor=#bd6bff","") '去掉背景颜色
strContent = Replace(strContent,"width=603","width=100% ") '把一个定义了大小的表格放到最大
strContent = Replace(strContent,"width=554","width=100% ") '把一个定义了大小的表格放到最大
strContent = "<table width=100% border=0 cellspacing=0 cellpadding=10 align=center >"&strContent&"</td></tr></table>" '修补HTML的结构错误
Else '其他分类新闻的界面
start=NewsString(strContent,"<th class=f24>") '截取的起点
over=NewsString(strContent,"<br clear=all>") '截取的终点
strContent=mid(strContent,start,over-start) '截取新闻
re.Pattern = "\<table border=0(.[^\[]*)\<\/table>"
strContent = re.Replace(strContent,"") '去掉画中画广告
strContent = Replace(strContent,"/p>","") '去掉页面中一个奇怪的错误
strContent = "<table width=100% border=0 cellspacing=0 cellpadding=10 align=center >"&strContent&"</td></tr></table>" '修补HTML的结构错误
End if
Autolink=strContent
End Function

%>
<style type="text/css">
<!--
td {  font-size: 12px}
-->
</style>
<table width="770" border="0" cellspacing="0" cellpadding="10" align="center" class="line_l_r" bgcolor="#EEEEEE">
  <tr>
    <td>
  <% Response.Write wstr %>

</td>
  </tr>
</table>

300*300
  • 没有相关文章
  • 是不是搞错了,明明是ASP的内容却搞成了jQuery[2010-04-10 14:37:23]
 文章首页关于迷茫时代关于我写意人生
版权所有:迷茫时代 All rights reserved   
执行时间:0.00612 秒