自己做采集程序

  现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要是去分析源网站的网页结构。

  首先去下载个XMLHTTP的类文件:

  <%

  Class xhttp

  private cset,sUrl,sError

  Private Sub Class_Initialize()

  'cset="UTF-8"

  cset="GB2312"

  sError=""

  end sub

  Private Sub Class_Terminate()

  End Sub

  Public Property LET URL(theurl)

  sUrl=theurl

  end property

  public property GET BasePath()

  BasePath=mid(sUrl,1,InStrRev(sUrl,"/")-1)

  end property

  public property GET FileName()

  FileName=mid(sUrl,InStrRev(sUrl,"/")+1)

  end property

  public property GET Html()

  Html=BytesToBstr(getBody(sUrl))

  end property

  public property GET xhttpError()

  xhttpError=sError

  end property

  private Function BytesToBstr(body)

  on error resume next

  'Cset:GB2312 UTF-8

  dim objstream

  set objstream = Server.CreateObject("adodb.stream")

  with objstream

  .Type = 1 '

  .Mode = 3 '

  .Open

  .Write body  '

  .Position = 0 '

  .Type = 2  '

  .Charset = Cset  '

  BytesToBstr = .ReadText '

  .Close

  end with

  set objstream = nothing

  End Function

  private function getBody(surl)

  on error resume next

  dim xmlHttp

  'Set xmlHttp=server.createobject("Msxml2.XMLHTTP.4.0")

  'set xmlHttp=server.createobject("Microsoft.XMLHTTP")

  set xmlHttp=server.createobject("MSXML2.ServerXMLHTTP")

  xmlHttp.setTimeouts 10000,10000,10000,30000

  xmlHttp.open "GET",surl,false

  xmlHttp.send

  if xmlHttp.readystate=4 then

  'if xmlHttp.status=200 then

  getBody=xmlhttp.responsebody

  'end if

  else

  getBody=""

  end if

  if Err.Number<>0 then

  sError=Err.Number

  Err.clear

  else

  sError=""

  end if

  set xmlHttp=nothing

  end function

  Public function saveimage(tofile,isoverwrite)

  on error resume next

  dim objStream,objFSO,imgs

  if Not isoverwrite Then

  Set objFSO = Server.CreateObject("Scripting.FileSystemObject")

  If objFSO.FileExists(Server.MapPath(tofile)) Then

  Exit Function

  End If

  Set objFSO = Nothing

  End IF

  imgs=getBody(sUrl)

  Set objStream = Server.CreateObject("ADODB.Stream")

  with objStream

  .Type =1

  .Open

  .write imgs

  .SaveToFile server.mappath(tofile),2

  .Close()

  end with

  set objstream=nothing

  end function

  end class

  %>

  用了这个类文件,做起事情来就方便多了。

  然后就可以分析采集网站的网页结构,写采集程序了。

  下面给个例子:

  <!--#include file="conn.asp"-->

  <!--#include file="inc/xhttp_class.asp"-->

  <!--#include file="inc/function.asp"-->

  <%

  server.ScriptTimeout = 1000

  %>

  <html>

  <head>

  <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />

  <title>BT采集器</title>

  </head>

  <body>

  <form name="form1" method="post" action="get81bt.asp">

  分类ID:

  <input type="text" name="cid" value="<%=request("cid")%>"><br>

  开始ID:

  <input type="text" name="startid" value="<%=request("startid")%>">

  <br>

  结束ID:

  <input type="text" name="overid" value="<%=request("overid")%>">

  <br>

  分类名称:<input type="text" name="classname" value="<%=request("classname")%>">为空自动获取

  <br>

  <input name="action" type="hidden" id="action" value="getdata">

  <input type="submit" name="Submit" value="采集">

  </form>

  当前ID:<%=request("id")%> <br>

  <%

  dim action

  action = Request("action")

  if action = "getdata" then

  cid = Request("cid")

  startid = Request("startid")

  overid = Request("overid")

  id = Request("id")

  if id = "" then id = startid

  set objxhttp = new xhttp

  objxhttp.URL = "http://www.81dd.com/Class/"&cid&"_"&id&".htm"

  content = objxhttp.Html

  if InStr(content,"网站维护中") then

  call NextID

  response.End()

  end if

  list = GetContent(content,"<!--内容开始-->","<!--内容结束-->",0)

  Dim regEx, Match, Matches,patrn

  Set regEx = New RegExp

  patrn = "<a href=""../BtHtml/(.+?)"">"

  regEx.Pattern = patrn

  regEx.IgnoreCase = True

  regEx.Global = True

  Set Matches = regEx.Execute(list)

  on error resume next

  For Each Match in Matches

  'response.write Match.Value & "<br>"

  weburl = "http://www.81dd.com/BtHtml/" & regEx.Replace(Match.Value,"$1")

  response.write weburl & "<br>"

  response.Flush()

  objxhttp.URL = weburl

  cpage = objxhttp.Html

  cpage = GetContent(cpage,"<!--内容开始-->","<!--内容结束-->",0)

  title = GetContent(cpage,"BT资源名称:<strong>","</strong>",0)

  title = stripHTML(title)

  IF Request("classname") <> "" then

  classname = Request("classname")

  Else

  if InStr(title,"喜剧") then

  classname = "喜剧"

  Elseif InStr(title,"动作") then

  classname = "动作"

  Elseif InStr(title,"惊悚") then

  classname = "惊悚"

  Elseif InStr(title,"犯罪") then

  classname = "犯罪"

  Elseif InStr(title,"恐怖") then

  classname = "恐怖"

  Elseif InStr(title,"爱情") then

  classname = "爱情"

  Elseif InStr(title,"冒险") then

  classname = "冒险"

  Elseif InStr(title,"科幻") then

  classname = "科幻"

  Elseif InStr(title,"悬念") then

  classname = "悬念"

  Elseif InStr(title,"奇幻") then

  classname = "奇幻"

  Elseif InStr(title,"战争") then

  classname = "战争"

  Elseif InStr(title,"连续剧") then

  classname = "连续剧"

  Elseif InStr(title,"综艺") then

  classname = "综艺"

  Elseif InStr(title,"灾难") then

  classname = "灾难"

  Elseif InStr(title,"伦理") then

  classname = "伦理"

  Elseif InStr(title,"动漫") or InStr(title,"动画") then

  classname = "动漫"

  Elseif InStr(title,"国语") or InStr(title,"集") then

  classname = "其他影视"

  Else

  classname = "其他"

  End if

  End IF

  intro = GetContent(cpage,"<tr><td width=770 bgcolor=#FFFFFF><div style=""margin:10px;line-height:150%"">","</div>",0)

  intro = Replace(intro,"<br />","[br]")

  intro = Replace(intro,"<BR />","[br]")

  intro = Replace(intro,"<BR>","[br]")

  intro = Replace(intro,"<br>","[br]")

  intro = Replace(intro,"<p>","[p]")

  intro = Replace(intro,"<P>","[p]")

  intro = Replace(intro,"</p>","[/p]")

  intro = Replace(intro,"</P>","[p]")

  intro = Replace(intro,"<img","[img")

  intro = Replace(intro,"<IMG","[img")

  intro = stripHTML(intro)

  intro = Replace(intro,"[br]","<br>")

  intro = Replace(intro,"[p]","<p>")

  intro = Replace(intro,"[/p]","</p>")

  intro = Replace(intro,"[img","<img")

  intro = Replace(intro,"[img]","<img src=")

  intro = Replace(intro,"[/img]",">")

  intro = Replace(intro,"[IMG]","<img src=")

  intro = Replace(intro,"[/IMG]",">")

  'response.write t

  'response.End()

  addtime = Trim(GetContent(cpage,"发布时间:"," ",0))

  if Not IsDate(addtime) then addtime = now()

  username = "bt"

  filesize = GetContent(content,"BT文件大小:"," ",0)

  title2 = title

  downurl = GetContent(cpage,"<a style=""color:red"" href=""","""",0)

  p = CDate(addtime)

  Dim sRnd

  Randomize

  sRnd = Int(900 * Rnd) + 100

  sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & ".torrent"

  url = "torrent/" & year(p) & "-" & month(p) & "-" & day(p) & "/" & sFileName

  Call CreateF(url)

  'Text

  Response.Write classname & "<br>"

  Response.write title & "<br>"

  'response.Write intro & "<br>"

  'response.Write addtime & "<br>"

  'response.Write username & "<br>"

  'response.Write filesize & "<br>"

  response.Write downurl & "<br>"

  response.Write url & "<br>"

  response.Flush()

  'response.End()

  'database

  if err.number = 0 then

  if (Not IsNull(title)) and title <> "" and downurl <> "" then

  set rs = server.CreateObject("adodb.recordset")

  sql = "select * from bt_class where classname = '" & classname & "'"

  rs.open sql,conn,1,3

  if rs.eof then

  rs.addnew

  rs("classname") = classname

  rs.update

  end if

  classid = rs("classid")

  rs.close

  set rs = nothing

  set rs = server.CreateObject("adodb.recordset")

  sql = "select * from bt_movie where title in ('" & title & "')"

  rs.open sql,conn,1,3

  if rs.eof then

  response.Write "<div><font color=blue>写入数据库...</font></div>"

  response.Flush()

  rs.addnew

  rs("classid") = classid

  rs("title") = title

  rs("title2") = title2

  rs("intro") = intro

  rs("username") = username

  rs("filesize") = filesize

  rs("url") = url

  rs("serverid") = 1

  rs("addtime") = addtime

  rs("ismake") = 0

  rs.update

  objxhttp.URL = downurl

  objxhttp.saveimage url,False

  else

  response.Write "<div><font color=red>已经存在!</font></div>"

  end if

  rs.close

  set rs = nothing

  'objxhttp.URL = downurl

  'objxhttp.saveimage url,False

  End IF

  Else

  err.clear

  End IF

  response.Write "-------------------------------------------<br>"

  Next

  set regEx = nothing

  response.Write "下一页<br>"

  response.Flush()

  Call NextID()

  end if

  Sub NextID

  conn.close

  set conn = nothing

  if cint(startid) < cint(overid) and cint(id) < cint(overid) then

  response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id + 1 &"'</script>"

  Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then

  response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id - 1 &"'</script>"

  Else

  Response.Write "采集完成!<br>"

  response.End()

  End if

  End Sub

  %>

  </body>

  </html>