实例讲解ASP实现抓取网上房产信息

  <%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%>

  <!-- #include file="conn.ASP" -->

  <!-- #include file="inc/function.asp" -->

  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

  <html>

  <head>

  <title>Untitled Document</title>

  <meta http-equiv="Content-Type" content="text/html; charset=gb2312">

  <meta http-equiv="refresh" content="300;URL=steal_house.asp">

  </head>

  <body>

  <%

  on error resume next

  '

  Server.ScriptTimeout = 999999

  '========================================================

  '字符编码函数

  '====================================================

  Function BytesToBstr(body,code)

  dim objstream

  set objstream = Server.createObject("adodb.stream")

  objstream.Type = 1

  objstream.Mode =3

  objstream.Open

  objstream.Write body

  objstream.Position = 0

  objstream.Type = 2

  objstream.Charset =code

  BytesToBstr = objstream.ReadText

  objstream.Close

  set objstream = nothing

  End Function

  '取行字符串在另一字符串中的出现位置

  Function Newstring(wstr,strng)

  Newstring=Instr(lcase(wstr),lcase(strng))

  if Newstring<=0 then Newstring=Len(wstr)

  End Function

  '替换字符串函数

  function ReplaceStr(ori,str1,str2)

  ReplaceStr=replace(ori,str1,str2)

  end function

  '====================================================

  function ReadXML(url,code,start,ends)

  set oSend=createobject("Microsoft.XMLHTTP")

  SourceCode = oSend.open ("GET",url,false)

  oSend.send()

  ReadXml=BytesToBstr(oSend.responseBody,code )

  start=Instr(ReadXml,start)

  ReadXml=mid(ReadXml,start)

  ends=Instr(ReadXml,ends)

  ReadXml=left(ReadXml,ends-1)

  end function

  function SubStr(body,start,ends)

  start=Instr(body,start)

  SubStr=mid(body,start+len(start)+1)

  ends=Instr(SubStr,ends)

  SubStr=left(SubStr,ends-1)

  end function

  dim getcont,NewsContent

  dim url,title

  url="http://www.***.com"'新闻网址knowsky.com

  getcont=ReadXml(url,"gb2312","<table class=k2 border=""0""","</table>")

  getcont=RegexHtml(getcont)

  dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra

  dim ContactMan,Contact

  for i=2 to ubound(getcont)

  response.Write(getcont(i)&"__<br>")

  tempLink=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),""" onClick")-10)

  tempLink=replace(tempLink,"../","")

  response.Write(i&":"&tempLink&"<br>")

  NewsContent=ReadXml(tempLink,"gb2312","<td valign=""bottom"" width=""400"">","<hr width=""760"" noshade size=""1"" color=""#808080""> ")

  NewsContent=RemoveHtml(NewsContent)

  NewsContent=replace(NewsContent,VbCrLf,"")

  NewsContent=replace(NewsContent,vbNewLine,"")

  NewsContent=replace(NewsContent," ","")

  NewsContent=replace(NewsContent," ","")

  NewsContent=replace(NewsContent," ","")

  NewsContent=replace(NewsContent,"\n","")

  NewsContent=replace(NewsContent,chr(10),"")

  NewsContent=replace(NewsContent,chr(13),"")

  '===============get Content=======================

  response.Write(NewsContent)

  KeyId=SubStr(NewsContent,"列号:","信息类别:")

  NewsClass=SubStr(NewsContent,"类别:","所在城市:")

  City=SubStr(NewsContent,"城市:","房屋具体位置:")

  Position=SubStr(NewsContent,"位置:","房屋类型:")

  HouseType=SubStr(NewsContent,"类型:","楼层:")

  Level=SubStr(NewsContent,"楼层:","使用面积:")

  Area=SubStr(NewsContent,"面积:","房价:")

  Price=SubStr(NewsContent,"房价:","其他说明:")

  Demostra=SubStr(NewsContent,"说明:","联系人:")

  ContactMan=SubStr(NewsContent,"联系人:","联系方式:")

  Contact=SubStr(NewsContent,"联系方式:","信息来源:")

  response.Write("总序列号:"&KeyId&"<br>")

  response.Write("信息类别:"&NewsClass&"<br>")

  response.Write("所在城市:"&City&"<br>")

  response.Write("房屋具体位置:"&Position&"<br>")

  response.Write("房屋类型:"&HouseType&"<br>")

  response.Write("楼层:"&Level&"<br>")

  response.Write("使用面积:"&Area&"<br>")

  response.Write("房价:"&Price&"<br>")

  response.Write("其他说明:"&Demostra&"<br>")

  response.Write("联系人:"&ContactMan&"<br>")

  response.Write("联系方式:"&Contact&"<br>")

  'title=RemoveHTML(aa(i))

  'response.Write("title:"&title)

  for n=0 to application.Contents.count

  if(application.Contents(n)=KeyId) then

  ifexit=true

  end if

  next

  if not ifexit then

  application(time&i)=KeyId

  '添加到数据库

  '====================================================

  set rs=server.createObject("adodb.recordset")

  rs.open "select top 1 * from news order by id desc",conn,3,3

  rs.addnew

  rs("NewsClass")=NewsClass

  rs("City")=City

  rs("Position")=Position

  rs("HouseType")=HouseType

  rs("Level")=Level

  rs("Area")=Area

  rs("Price")=Price

  rs("Demostra")=Demostra

  rs("ContactMan")=ContactMan

  rs("Contact")=Contact

  rs.update

  rs.close

  set rs=nothing

  end if

  '==================================================

  next

  function RemoveTag(body)

  Set regEx = New RegExp

  regEx.Pattern = "<[a].*?<\/[a]>"

  regEx.IgnoreCase = True

  regEx.Global = True

  Set Matches = regEx.Execute(body)

  dim i,arr(15),ifexit

  i=0

  j=0

  For Each Match in Matches

  TempStr = Match.Value

  TempStr=replace(TempStr,"<td>","")

  TempStr=replace(TempStr,"</td>","")

  TempStr=replace(TempStr,"<tr>","")

  TempStr=replace(TempStr,"</tr>","")

  arr(i)=TempStr

  i=i+1

  if(i>=15) then

  exit for

  end if

  Next

  Set regEx=nothing

  Set Matches =nothing

  RemoveTag=arr

  end function

  function RegexHtml(body)

  dim r_arr(47),r_temp

  Set regEx2 = New RegExp

  regEx2.Pattern ="<a.*?<\/a>"

  regEx2.IgnoreCase = True

  regEx2.Global = True

  Set Matches2 = regEx2.Execute(body)

  iii=0

  For Each Match in Matches2

  r_arr(iii)=Match.Value

  iii=iii+1

  Next

  RegexHtml=r_arr

  set regEx2=nothing

  set Matches2=nothing

  end function

  '======================================================

  conn.close

  set conn=nothing

  %>

  </body>

  </html>

  function.asp

  <%

  '**************************************************

  '函数名:gotTopic

  '作 用:截字符串,汉字一个算两个字符,英文算一个字符

  '参 数:str ----原字符串

  ' strlen ----截取长度

  '返回值:截取后的字符串

  '**************************************************

  function gotTopic(str,strlen)

  if str="" then

  gotTopic=""

  exit function

  end if

  dim l,t,c, i

  str=replace(replace(replace(replace(str," "," "),""",chr(34)),">",">"),"<","<")

  str=replace(str,"?","")

  l=len(str)

  t=0

  for i=1 to l

  c=Abs(Asc(Mid(str,i,1)))

  if c>255 then

  t=t+2

  else

  t=t+1

  end if

  if t>=strlen then

  gotTopic=left(str,i) & "…"

  exit for

  else

  gotTopic=str

  end if

  next

  gotTopic=replace(replace(replace(replace(gotTopic," "," "),chr(34),"""),">",">"),"<","<")

  end function

  '=========================================================

  '函数:RemoveHTML(strHTML)

  '功能:去除HTML标记

  '参数:strHTML --要去除HTML标记的字符串

  '=========================================================

  Function RemoveHTML(strHTML)

  Dim objRegExp, Match, Matches

  Set objRegExp = New Regexp

  objRegExp.IgnoreCase = True

  objRegExp.Global = True

  '取闭合的<>

  objRegExp.Pattern = "<.+?>"

  '进行匹配

  Set Matches = objRegExp.Execute(strHTML)

  ' 遍历匹配集合,并替换掉匹配的项目

  For Each Match in Matches

  strHtml=Replace(strHTML,Match.Value,"")

  Next

  RemoveHTML=strHTML

  Set objRegExp = Nothing

  set Matches=nothing

  End Function

  %>

  conn.asp

  <%

  'on error resume next

  set conn=server.createObject("adodb.connection")

  con= "driver={Microsoft Access Driver (*.mdb)};dbq=" & Server.MapPath("stest.mdb")

  conn.open con

  sub connclose

  conn.close

  set conn=nothing

  end sub

  %>