[资料] 贴一段代码,测试用D写爬虫的能力

fxsjy 2007-10-12
import std.stdio;
import std.string;
import std.conv;
import std.socket;
import std.socketstream;
import std.stream;
import std.regexp;
import std.thread;
import std.c.time;

// Work queue of URLs still to be fetched. main() seeds it with the homepage;
// crawl() takes entries from the front and appends newly discovered links.
char[][] g_queue;
// Number of URLs that have been queued but not yet finished (or failed).
// Incremented whenever a URL is appended to g_queue, decremented by crawl()
// when a fetch completes or throws; main() exits once it drops to <= 0.
int g_task_amount=0;
// Start URL of the crawl; also the prefix prepended to every
// "files/list_N.html" link that crawl() extracts from a page.
const char[] homepage="http://mobile.younet.com/";
// Number of worker threads that main() starts.
const ushort max_thread=20;

// Short name for std.string.find, used by getHTML() and crawl().
// NOTE(review): presumably needed because another imported module also
// exports a function called find — confirm.
alias std.string.find strfind;

/**
 * Fetches a document with a plain HTTP/1.0 GET and returns the response body.
 *
 * The URL is split into host, optional port (default 80) and path by hand.
 * If the response contains a "Location: " header or a "URL=" meta refresh,
 * the target is fetched instead.
 *
 * Params:
 *   url           = absolute URL to fetch.
 *   max_redirects = how many redirects may still be followed; guards
 *                   against redirect loops. Callers normally omit it.
 *
 * Returns:
 *   the response body (everything after the first blank line), or null when
 *   url is not accepted by isURL() or the response has no header terminator.
 *
 * Throws:
 *   Exception when the redirect limit is exhausted; whatever the socket,
 *   stream or std.conv routines throw on connection or parse errors.
 */
char[] getHTML(char[] url,int max_redirects=8){

	char[] domain,html;
	ubyte[1024] buf;
	ushort port=80;
	if(!isURL(url))return null;

	// Strip the scheme only when one is actually present; the old code
	// blindly cut (-1+3) characters when "://" was missing.
	int scheme=strfind(url,"://");
	if(scheme>=0){
		url=url[scheme+3..$];
	}

	// e marks the end of the authority part (host[:port]).
	int e=strfind(url,"/");
	if(e<0){
		e=url.length;
	}
	// Look for the port separator inside the authority only, so a ':' in
	// the path or query string can never produce an inverted slice.
	int j=strfind(url[0..e],":");
	if(j>0){
		if(j+1<e){
			port=toUshort(url[j+1..e]);
		}
		domain=url[0..j];
	}
	else{
		domain=url[0..e];
	}

	if(e==url.length){
		url="/";
	}
	else{
		url=url[e..$];
	}

	debug(younet){
		// Explicit format string: the URL may legitimately contain '%'.
		writefln("%s  %s  %s",port,domain,url);
	}

	Socket sock=new TcpSocket(new InternetAddress(domain,port));
	Stream ss=new SocketStream(sock);
	try{
		// Headers end with exactly one empty line.
		ss.writeString("GET " ~ url ~ " HTTP/1.0\r\n" ~
			 "Host: " ~ domain ~ "\r\n" ~
			 "Connection: close\r\n" ~
			 "Referer: http://" ~ domain ~ url ~ "\r\n" ~
			 "\r\n");
		int recv_amount=ss.read(buf);
		while(recv_amount>0){
			html ~= cast(char[])buf[0..recv_amount];
			recv_amount=ss.read(buf);
		}
	}
	finally{
		// Release the connection even when writing or reading throws.
		ss.close();
		sock.close();
	}

	// Follow HTTP redirects and meta refreshes, but only a bounded number
	// of times so two pages redirecting to each other cannot recurse forever.
	char[][] mc=RegExp("(URL=|Location: )(.*?)[\"\r]").match(html);
	if(mc.length==3){
		if(max_redirects<=0){
			throw new Exception("too many redirects, last target: " ~ mc[2]);
		}
		return getHTML(mc[2],max_redirects-1);
	}

	// The body starts after the first blank line. Without one the response
	// is incomplete and there is no body to return.
	int start_pos=strfind(html,"\r\n\r\n");
	if(start_pos<0){
		return null;
	}
	return html[start_pos+4 .. $];
}

/// Single monitor guarding g_queue and g_task_amount. A bare
/// `synchronized{}` statement gets its own private mutex, so separate
/// statements would not exclude each other; every critical section in
/// crawl() therefore locks this one object instead.
private Object g_crawl_lock;

static this(){
	g_crawl_lock=new Object;
}

/**
 * Worker thread body: repeatedly takes a URL from g_queue, downloads it with
 * getHTML() and, for pages that are not themselves "files/list" pages,
 * queues every "files/list_N.html" link found in the page.
 *
 * The worker exits once the queue is empty and g_task_amount shows that no
 * other worker can still add new URLs.
 *
 * Params:
 *   p = unused thread argument.
 *
 * Returns: always 1.
 */
int crawl(void * p){
	while(true){
		char[] url,html;
		bool idle=false;
		synchronized(g_crawl_lock){
			if(g_queue.length==0){
				// Nothing queued and nothing in flight: the crawl is over.
				if(g_task_amount<=0)
					break;
				idle=true;
			}
			else{
				url=g_queue[0];
				g_queue=g_queue[1..$];
			}
		}
		if(idle){
			// Wait without holding the lock so other workers can proceed.
			sleep(1);
			continue;
		}

		// URLs may contain '%', so they are never used as the format string.
		writefln("begin:%s",url);
		try{
			html=getHTML(url);
		}
		catch(Exception ex){
			int remaining;
			synchronized(g_crawl_lock){
				if(g_task_amount>0)
					g_task_amount-=1;
				remaining=g_task_amount;
			}
			writefln("%s",ex);
			writefln("failed:%s",url);
			writefln("remains%s",remaining);
			continue;
		}
		debug(younet){
			// Show at most the first 200 bytes; short pages must not trap.
			int shown=html.length<200 ? html.length : 200;
			writef("!!!%s",html[0..shown]);
		}
		if(strfind(url,"files/list")<0){
			// Scan the page outside the lock, then publish all links at once.
			char[][] found;
			foreach(m;RegExp("files/list_\\d+\\.html").search(html)){
				found ~= homepage ~ m.match(0);
			}
			synchronized(g_crawl_lock){
				g_queue ~= found;
				g_task_amount+=cast(int)found.length;
				g_task_amount-=1;
			}
			writefln("done:%s",url);
			debug(younet){
				writefln(g_queue);
			}
		}
		else{
			int remaining;
			synchronized(g_crawl_lock){
				g_task_amount-=1;
				remaining=g_task_amount;
			}
			writefln("done:%s",url);
			writefln("remains%s",remaining);
		}

	}
	return 1;
}
/**
 * Program entry point: seeds the work queue with the homepage, starts
 * max_thread crawl() workers and polls once per second until
 * g_task_amount reports that no work is left.
 *
 * Returns: always 0.
 */
int main(char[][] args){
	// Seed the crawl with a single pending task.
	g_queue ~= homepage;
	g_task_amount+=1;

	// Launch the worker pool, keeping a reference to every thread.
	Thread[] tds;
	int started=0;
	while(started<max_thread){
		Thread worker=new Thread(&crawl,null);
		worker.start();
		tds ~= worker;
		started++;
	}

	// Give the workers a head start before polling.
	sleep(5);
	do{
		sleep(1);
	}while(g_task_amount>0);
	return 0;
}
redsea 2007-10-12
下面这段解析 URL(域名、端口、路径)的代码应该也能用正则搞定吧?
    int i=strfind(url,"://")+3;   
    url=url[i..$];   
    int j=strfind(url,":");   
    int e=strfind(url,"/");   
    if(e<0){   
        e=url.length;   
    }   
    if(j>0){   
        port=toUshort(url[j+1..e]);   
        domain=url[0..j];   
    }   
    else{   
        domain=url[0..e];   
    }   
  
    if(e==url.length){   
        url="/";   
    }   
    else{   
        url=url[e..$];   
    }   

fxsjy 2007-10-12
嗯,应该可以
sw2wolf 2007-10-13
C:\WINDOWS.0\system32\cmd.exe /c dmd crawl.d
E:\d\dmd\bin\..\..\dm\bin\link.exe crawl,,,user32+kernel32/noi;
OPTLINK (R) for Win32  Release 8.00.1
Copyright (C) Digital Mars 1989-2004  All rights reserved.
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getprotobyname@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getprotobynumber@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getservbyname@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getservbyport@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _WSAGetLastError@0
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _gethostbyname@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _gethostbyaddr@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _inet_addr@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _inet_ntoa@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _ioctlsocket@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getsockopt@20
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _bind@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _connect@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _listen@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _accept@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _closesocket@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _shutdown@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getpeername@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getsockname@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _send@16
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _sendto@24
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _recv@16
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _recvfrom@24
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _setsockopt@20
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _socket@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _WSAStartup@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _WSACleanup@0
oldrev 2007-10-13
sw2wolf 写道
C:\WINDOWS.0\system32\cmd.exe /c dmd crawl.d
E:\d\dmd\bin\..\..\dm\bin\link.exe crawl,,,user32+kernel32/noi;
OPTLINK (R) for Win32  Release 8.00.1
Copyright (C) Digital Mars 1989-2004  All rights reserved.
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getprotobyname@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getprotobynumber@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getservbyname@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getservbyport@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _WSAGetLastError@0
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _gethostbyname@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _gethostbyaddr@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _inet_addr@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _inet_ntoa@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _ioctlsocket@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getsockopt@20
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _bind@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _connect@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _listen@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _accept@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _closesocket@4
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _shutdown@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getpeername@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _getsockname@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _send@16
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _sendto@24
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _recv@16
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _recvfrom@24
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _setsockopt@20
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _socket@12
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _WSAStartup@8
E:\d\dmd\bin\..\lib\phobos.lib(socket)
Error 42: Symbol Undefined _WSACleanup@0

这些未定义符号都是 Winsock 的函数,链接时加上 dmd/lib 里的 ws2_32.lib 即可。
fxsjy 2007-10-13
dmd ws2_32.lib crawl.d
Global site tag (gtag.js) - Google Analytics