[资料] 贴一段代码,测试用D写爬虫的能力
fxsjy
2007-10-12
/*
 * Small multi-threaded crawler (D1 / Phobos).
 *
 * Starts from `homepage`, downloads it over plain HTTP/1.0, extracts every
 * "files/list_<n>.html" link and downloads each of those pages once.
 * Progress is reported on stdout ("begin:", "done:", "failed:", "remains").
 *
 * Build on Windows with:  dmd ws2_32.lib crawl.d
 */
import std.stdio;
import std.string;
import std.conv;
import std.socket;
import std.socketstream;
import std.stream;
import std.regexp;
import std.thread;
import std.c.time;

// URLs waiting to be fetched. Guarded by g_lock.
char[][] g_queue;
// Number of tasks that are queued or currently being fetched. Guarded by g_lock.
int g_task_amount=0;
// One lock shared by every critical section. A bare `synchronized{}` statement
// gets its own private mutex, so separate bare statements do not exclude each
// other; locking on a common object does.
Object g_lock;

const char[] homepage="http://mobile.younet.com/";
const ushort max_thread=20;

alias std.string.find strfind;

static this(){
    g_lock=new Object;
}

/*
 * Downloads `url` with an HTTP/1.0 GET and returns the response body.
 *
 * Params:
 *   url            = absolute URL; anything isURL() rejects yields null
 *   redirects_left = how many more "Location: " / "URL=" redirects may be
 *                    followed before giving up (guards against redirect loops)
 *
 * Returns: the text after the header block, or null when the URL is not
 *          accepted, the redirect budget is exhausted, or the response has no
 *          header/body separator.
 *
 * Socket and conversion errors propagate to the caller as exceptions.
 */
char[] getHTML(char[] url,int redirects_left=5){
    char[] domain,html;
    ubyte[1024] buf;
    ushort port=80;

    if(!isURL(url))return null;

    // Drop the scheme if there is one (isURL also accepts "www." prefixes).
    int i=strfind(url,"://");
    if(i>=0){
        url=url[i+3..$];
    }

    // e marks the end of the authority part (host[:port]).
    int e=strfind(url,"/");
    if(e<0){
        e=url.length;
    }
    // Only a ':' inside the authority is a port separator; a ':' further down
    // the path must not be parsed as one.
    int j=strfind(url[0..e],":");
    if(j>0){
        port=toUshort(url[j+1..e]);
        domain=url[0..j];
    }
    else{
        domain=url[0..e];
    }
    if(e==url.length){
        url="/";
    }
    else{
        url=url[e..$];
    }

    debug(younet){
        writefln("%s %s %s",port,domain,url);
    }

    Socket sock=new TcpSocket(new InternetAddress(domain,port));
    Stream ss=new SocketStream(sock);
    try{
        // A request ends with exactly one blank line; no trailing bytes are sent.
        ss.writeString("GET " ~ url ~ " HTTP/1.0\r\n"
            ~ "Host: " ~ domain ~ "\r\n"
            ~ "Connection: close\r\n"
            ~ "Referer: http://" ~ domain ~ url ~ "\r\n"
            ~ "\r\n");

        // The server closes the connection when done, so read until EOF.
        int recv_amount=ss.read(buf);
        while(recv_amount>0){
            html ~= cast(char[])buf[0..recv_amount];
            recv_amount=ss.read(buf);
        }
    }
    finally{
        // Release the connection even when the transfer throws.
        ss.close();
        sock.close();
    }

    // Follow an HTTP "Location: " header or a meta-refresh "URL=" target.
    char[][] mc=RegExp("(URL=|Location: )(.*?)[\"\r]").match(html);
    if(mc.length==3){
        if(redirects_left<=0)return null;
        return getHTML(mc[2],redirects_left-1);
    }

    // Strip the response headers.
    int start_pos=strfind(html,"\r\n\r\n");
    if(start_pos<0)return null;
    return html[start_pos+4 .. $];
}

/*
 * Worker thread body: repeatedly takes a URL from g_queue, downloads it and,
 * for pages that are not themselves list pages, queues every
 * "files/list_<n>.html" link found in it.
 *
 * The worker keeps running while any task is queued or in flight, because an
 * in-flight page may still add new URLs; it exits once g_task_amount drops to
 * zero.
 *
 * Params: p = unused thread argument
 * Returns: always 1
 */
int crawl(void * p){
    while(true){
        char[] url,html;
        bool finished=false;

        synchronized(g_lock){
            if(g_queue.length>0){
                url=g_queue[0];
                g_queue=g_queue[1..$];
            }
            else if(g_task_amount<=0){
                finished=true;
            }
        }
        if(finished)break;
        if(url.length==0){
            // Nothing queued yet but other workers are still fetching:
            // wait outside the lock so they are not blocked.
            sleep(1);
            continue;
        }

        // Always pass the URL as an argument: writefln treats a leading string
        // as a format string, and URLs routinely contain '%'.
        writefln("begin:%s",url);

        try{
            html=getHTML(url);
        }
        catch(Exception ex){
            int left;
            synchronized(g_lock){
                if(g_task_amount>0) g_task_amount-=1;
                left=g_task_amount;
            }
            writefln("%s",ex.toString());
            writefln("failed:%s",url);
            writefln("remains%s",left);
            continue;
        }

        debug(younet){
            // Print at most the first 200 bytes; printf is used because the
            // page need not be valid UTF-8.
            size_t preview=html.length<200 ? html.length : 200;
            printf("!!!%.*s",cast(int)preview,html.ptr);
        }

        if(strfind(url,"files/list")<0){
            // Collect the links first so the lock is held only for the update.
            char[][] found;
            foreach(m;RegExp("files/list_\\d+\\.html").search(html)){
                found ~= homepage ~ m.match(0);
            }
            synchronized(g_lock){
                g_queue ~= found;
                g_task_amount+=cast(int)found.length;
                g_task_amount-=1;
                debug(younet){
                    writefln(g_queue);
                }
            }
            writefln("done:%s",url);
        }
        else{
            int left;
            writefln("done:%s",url);
            synchronized(g_lock){
                g_task_amount-=1;
                left=g_task_amount;
            }
            writefln("remains%s",left);
        }
    }
    return 1;
}

/*
 * Seeds the queue with the home page, starts max_thread workers and waits
 * for all of them to finish.
 */
int main(char[][] args){
    g_queue ~= homepage;
    g_task_amount+=1;

    Thread[] tds;
    for(int i=0;i<max_thread;i++){
        Thread t=new Thread(&crawl,null);
        t.start();
        tds ~= t;
    }

    // Workers exit by themselves once no task is queued or in flight.
    foreach(Thread t;tds){
        t.wait();
    }
    return 0;
}
|
redsea
2007-10-12
这段应该也能用正则搞定吧?
int i=strfind(url,"://")+3; url=url[i..$]; int j=strfind(url,":"); int e=strfind(url,"/"); if(e<0){ e=url.length; } if(j>0){ port=toUshort(url[j+1..e]); domain=url[0..j]; } else{ domain=url[0..e]; } if(e==url.length){ url="/"; } else{ url=url[e..$]; } |
|
fxsjy
2007-10-12
嗯,应该可以
|
|
sw2wolf
2007-10-13
C:\WINDOWS.0\system32\cmd.exe /c dmd crawl.d
E:\d\dmd\bin\..\..\dm\bin\link.exe crawl,,,user32+kernel32/noi; OPTLINK (R) for Win32 Release 8.00.1 Copyright (C) Digital Mars 1989-2004 All rights reserved. E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getprotobyname@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getprotobynumber@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getservbyname@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getservbyport@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _WSAGetLastError@0 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _gethostbyname@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _gethostbyaddr@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _inet_addr@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _inet_ntoa@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _ioctlsocket@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getsockopt@20 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _bind@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _connect@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _listen@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _accept@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _closesocket@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _shutdown@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getpeername@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getsockname@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _send@16 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _sendto@24 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _recv@16 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined 
_recvfrom@24 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _setsockopt@20 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _socket@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _WSAStartup@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _WSACleanup@0 |
|
oldrev
2007-10-13
sw2wolf 写道 C:\WINDOWS.0\system32\cmd.exe /c dmd crawl.d
E:\d\dmd\bin\..\..\dm\bin\link.exe crawl,,,user32+kernel32/noi; OPTLINK (R) for Win32 Release 8.00.1 Copyright (C) Digital Mars 1989-2004 All rights reserved. E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getprotobyname@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getprotobynumber@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getservbyname@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getservbyport@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _WSAGetLastError@0 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _gethostbyname@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _gethostbyaddr@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _inet_addr@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _inet_ntoa@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _ioctlsocket@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getsockopt@20 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _bind@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _connect@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _listen@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _accept@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _closesocket@4 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _shutdown@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getpeername@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _getsockname@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _send@16 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _sendto@24 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _recv@16 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined 
_recvfrom@24 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _setsockopt@20 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _socket@12 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _WSAStartup@8 E:\d\dmd\bin\..\lib\phobos.lib(socket) Error 42: Symbol Undefined _WSACleanup@0 加上 dmd/lib 里的 ws2_32.lib |
|
fxsjy
2007-10-13
dmd ws2_32.lib crawl.d
|
相关讨论
相关资源推荐
- vs使用Installshield创建安装程序的问题
- InstallShield用于配置 IIS 的 InstallScrip
- installshield 安装mysql数据库_使用 InstallShield 安装和卸载SQL Server 数据库
- [开发手记]使用 InstallShield 安装和卸载SQL Server 数据库
- 使用 InstallShield 安装和卸载SQL Server 数据库[收藏]
- 使用InstallShield
- Installshield使用的几个经验
- Installshield2010 实现web部署和数据库安装示例
- installshield 4075 错误
- VS2012程序打包部署详解