自己做采集程序

　　现在网上的采集程序很多，但是有时候你发现一个好的网站，想自己做个采集工具采集一些信息，就需要自己去写程序了，其实这样的采集程序并不难写，主要是去分析源网站的网页结构。

　　首先去下载个XMLHTTP的类文件：

　　Class xhttp

　　private cset,sUrl,sError

　　Private Sub Class_Initialize()

　　'cset="UTF-8"

　　cset="GB2312"

　　sError=""

　　end sub

　　Private Sub Class_Terminate()

　　End Sub

　　Public Property LET URL(theurl)

　　sUrl=theurl

　　end property

　　public property GET BasePath()

　　BasePath=mid(sUrl,1,InStrRev(sUrl,"/")-1)

　　end property

　　public property GET FileName()

　　FileName=mid(sUrl,InStrRev(sUrl,"/")+1)

　　end property

　　public property GET Html()

　　Html=BytesToBstr(getBody(sUrl))

　　end property

　　public property GET xhttpError()

　　xhttpError=sError

　　end property

　　private Function BytesToBstr(body)

　　on error resume next

　　'Cset:GB2312 UTF-8

　　dim objstream

　　set objstream = Server.CreateObject("adodb.stream")

　　with objstream

　　.Type = 1 '

　　.Mode = 3 '

　　.Open

　　.Write body '

　　.Position = 0 '

　　.Type = 2 '

　　.Charset = Cset '

　　BytesToBstr = .ReadText '

　　.Close

　　end with

　　set objstream = nothing

　　End Function

　　private function getBody(surl)

　　on error resume next

　　dim xmlHttp

　　'Set xmlHttp=server.createobject("Msxml2.XMLHTTP.4.0")

　　'set xmlHttp=server.createobject("Microsoft.XMLHTTP")

　　set xmlHttp=server.createobject("MSXML2.ServerXMLHTTP")

　　xmlHttp.setTimeouts 10000,10000,10000,30000

　　xmlHttp.open "GET",surl,false

　　xmlHttp.send

　　if xmlHttp.readystate=4 then

　　'if xmlHttp.status=200 then

　　getBody=xmlhttp.responsebody

　　'end if

　　else

　　getBody=""

　　end if

　　if Err.Number<>0 then

　　sError=Err.Number

　　Err.clear

　　else

　　sError=""

　　end if

　　set xmlHttp=nothing

　　end function

　　Public function saveimage(tofile,isoverwrite)

　　on error resume next

　　dim objStream,objFSO,imgs

　　if Not isoverwrite Then

　　Set objFSO = Server.CreateObject("Scripting.FileSystemObject")

　　If objFSO.FileExists(Server.MapPath(tofile)) Then

　　Exit Function

　　End If

　　Set objFSO = Nothing

　　End IF

　　imgs=getBody(sUrl)

　　Set objStream = Server.CreateObject("ADODB.Stream")

　　with objStream

　　.Type =1

　　.Open

　　.write imgs

　　.SaveToFile server.mappath(tofile),2

　　.Close()

　　end with

　　set objstream=nothing

　　end function

　　end class

　　用了这个类文件，做起事情来就方便多了。

　　然后就可以分析采集网站的网页结构，写采集程序了。

　　下面给个例子：

　　server.ScriptTimeout = 1000

　　<html>

　　<head>

　　<title>BT采集器</title>

　　</head>

　　<body>

　　分类ID：

　　开始ID：

　　<br>

　　结束ID：

　　<br>

　　分类名称：<input type="text" name="classname" value="<%=request("classname")%>">为空自动获取

　　<br>

　　</form>

　　当前ID：<%=request("id")%> <br>

　　dim action

　　action = Request("action")

　　if action = "getdata" then

　　cid = Request("cid")

　　startid = Request("startid")

　　overid = Request("overid")

　　id = Request("id")

　　if id = "" then id = startid

　　set objxhttp = new xhttp

　　objxhttp.URL = "http://www.81dd.com/Class/"&cid&"_"&id&".htm"

　　content = objxhttp.Html

　　if InStr(content,"网站维护中") then

　　call NextID

　　response.End()

　　end if

list = GetContent(content,"","",0)

　　Dim regEx, Match, Matches,patrn

　　Set regEx = New RegExp

　　patrn = "<a href=""../BtHtml/(.+?)"">"

　　regEx.Pattern = patrn

　　regEx.IgnoreCase = True

　　regEx.Global = True

　　Set Matches = regEx.Execute(list)

　　on error resume next

　　For Each Match in Matches

　　'response.write Match.Value & "<br>"

　　weburl = "http://www.81dd.com/BtHtml/" & regEx.Replace(Match.Value,"$1")

　　response.write weburl & "<br>"

　　response.Flush()

　　objxhttp.URL = weburl

　　cpage = objxhttp.Html

cpage = GetContent(cpage,"","",0)

　　title = GetContent(cpage,"BT资源名称：<strong>","</strong>",0)

　　title = stripHTML(title)

　　IF Request("classname") <> "" then

　　classname = Request("classname")

　　Else

　　if InStr(title,"喜剧") then

　　classname = "喜剧"

　　Elseif InStr(title,"动作") then

　　classname = "动作"

　　Elseif InStr(title,"惊悚") then

　　classname = "惊悚"

　　Elseif InStr(title,"犯罪") then

　　classname = "犯罪"

　　Elseif InStr(title,"恐怖") then

　　classname = "恐怖"

　　Elseif InStr(title,"爱情") then

　　classname = "爱情"

　　Elseif InStr(title,"冒险") then

　　classname = "冒险"

　　Elseif InStr(title,"科幻") then

　　classname = "科幻"

　　Elseif InStr(title,"悬念") then

　　classname = "悬念"

　　Elseif InStr(title,"奇幻") then

　　classname = "奇幻"

　　Elseif InStr(title,"战争") then

　　classname = "战争"

　　Elseif InStr(title,"连续剧") then

　　classname = "连续剧"

　　Elseif InStr(title,"综艺") then

　　classname = "综艺"

　　Elseif InStr(title,"灾难") then

　　classname = "灾难"

　　Elseif InStr(title,"伦理") then

　　classname = "伦理"

　　Elseif InStr(title,"动漫") or InStr(title,"动画") then

　　classname = "动漫"

　　Elseif InStr(title,"国语") or InStr(title,"集") then

　　classname = "其他影视"

　　Else

　　classname = "其他"

　　End if

　　End IF

　　intro = GetContent(cpage,"<tr><td width=770 bgcolor=#FFFFFF><div style=""margin:10px;line-height:150%"">","</div>",0)

　　intro = Replace(intro,"<br />","[br]")

　　intro = Replace(intro,"<BR />","[br]")

　　intro = Replace(intro,"<BR>","[br]")

　　intro = Replace(intro,"<br>","[br]")

　　intro = Replace(intro,"<p>","[p]")

　　intro = Replace(intro,"<P>","[p]")

　　intro = Replace(intro,"</p>","[/p]")

　　intro = Replace(intro,"</P>","[p]")

　　intro = Replace(intro,"<img","[img")

　　intro = Replace(intro,"<IMG","[img")

　　intro = stripHTML(intro)

　　intro = Replace(intro,"[br]","<br>")

　　intro = Replace(intro,"[p]","<p>")

　　intro = Replace(intro,"[/p]","</p>")

　　intro = Replace(intro,"[img","<img")

　　intro = Replace(intro,"[img]","<img src=")

　　intro = Replace(intro,"[/img]",">")

　　intro = Replace(intro,"[IMG]","<img src=")

　　intro = Replace(intro,"[/IMG]",">")

　　'response.write t

　　'response.End()

　　addtime = Trim(GetContent(cpage,"发布时间："," ",0))

　　if Not IsDate(addtime) then addtime = now()

　　username = "bt"

　　filesize = GetContent(content,"BT文件大小："," ",0)

　　title2 = title

　　downurl = GetContent(cpage,"<a style=""color:red"" href=""","""",0)

　　p = CDate(addtime)

　　Dim sRnd

　　Randomize

　　sRnd = Int(900 * Rnd) + 100

　　sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & ".torrent"

　　url = "torrent/" & year(p) & "-" & month(p) & "-" & day(p) & "/" & sFileName

　　Call CreateF(url)

　　'Text

　　Response.Write classname & "<br>"

　　Response.write title & "<br>"

　　'response.Write intro & "<br>"

　　'response.Write addtime & "<br>"

　　'response.Write username & "<br>"

　　'response.Write filesize & "<br>"

　　response.Write downurl & "<br>"

　　response.Write url & "<br>"

　　response.Flush()

　　'response.End()

　　'database

　　if err.number = 0 then

　　if (Not IsNull(title)) and title <> "" and downurl <> "" then

　　set rs = server.CreateObject("adodb.recordset")

　　sql = "select * from bt_class where classname = '" & classname & "'"

　　rs.open sql,conn,1,3

　　if rs.eof then

　　rs.addnew

　　rs("classname") = classname

　　rs.update

　　end if

　　classid = rs("classid")

　　rs.close

　　set rs = nothing

　　set rs = server.CreateObject("adodb.recordset")

　　sql = "select * from bt_movie where title in ('" & title & "')"

　　rs.open sql,conn,1,3

　　if rs.eof then

　　response.Write "<div><font color=blue>写入数据库...</font></div>"

　　response.Flush()

　　rs.addnew

　　rs("classid") = classid

　　rs("title") = title

　　rs("title2") = title2

　　rs("intro") = intro

　　rs("username") = username

　　rs("filesize") = filesize

　　rs("url") = url

　　rs("serverid") = 1

　　rs("addtime") = addtime

　　rs("ismake") = 0

　　rs.update

　　objxhttp.URL = downurl

　　objxhttp.saveimage url,False

　　else

　　response.Write "<div><font color=red>已经存在！</font></div>"

　　end if

　　rs.close

　　set rs = nothing

　　'objxhttp.URL = downurl

　　'objxhttp.saveimage url,False

　　End IF

　　Else

　　err.clear

　　End IF

　　response.Write "-------------------------------------------<br>"

　　set regEx = nothing

　　response.Write "下一页<br>"

　　response.Flush()

　　Call NextID()

　　end if

　　Sub NextID

　　conn.close

　　set conn = nothing

　　if cint(startid) < cint(overid) and cint(id) < cint(overid) then

　　response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id + 1 &"'</script>"

　　Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then

　　response.Write "<script>location.href='get81bt.asp?action=getdata&classname=" & Request("classname") & "&cid=" & cid & "&startid=" & startid & "&overid=" & overid & "&id="& id - 1 &"'</script>"

　　Else

　　Response.Write "采集完成！<br>"

　　response.End()

　　End if

　　End Sub

　　</body>

　　</html>

教程首页更多教程