[疑难] D性能与Python比较!

hurd 2009-06-14
按照这个帖子里的要求,写了个D版本的处理程序。在数据不多时(小于300k),性能和py差不多;在百万条左右时,性能比py差很多,各位给看看哪里有问题。。。

http://www.iteye.com/topic/377619
import
	tango.core.Array,
	tango.io.Console,
	tango.io.Stdout,
	tango.io.device.File,
	tango.io.device.Array,
	tango.io.stream.Lines,
	tango.io.FilePath,
	tango.text.convert.Format,
	Int	= tango.text.convert.Integer,
	Txt	= tango.text.Util,
	TimS	= tango.text.convert.TimeStamp,
	tango.time.ISO8601,
	tango.time.Time,
	tango.time.Clock;

// Number of records per output file (see save); read1/read2 also print a
// progress line every this many input lines.
const i_write_count_of_record = 6000;
// Directory, relative to the working directory, that receives the output files.
const writefiletodir	= "test";
// Counters for ",DELIVRD," status rows: records without / with the isType4 flag.
int iSendDelivrd;
int iRecvDelivrd;
// Running sequence number that format_data embeds in every output row.
int inc_value;
// Bit flags kept in X1.mask.
// `exists` must stay the FIRST member: a D enum variable default-initializes
// to its first member, which is how a freshly allocated X1 starts out with the
// `exists` bit set even though read1 never assigns it explicitly.
enum Mask : uint {
	exists	= 0x1,		// still unmatched; read2 clears it on the first matching status row
	isType4	= 0x2,		// the input line started with '4'
	none	= ~exists,	// AND-mask that clears only the `exists` bit
}
// One parsed row of "wait-status.csv" (filled in by read1).
struct X1{
	Mask	mask;		// flag bits, see Mask
	uint	phone1;		// first number parsed after the key
	uint	phone2;		// second number parsed after the key
	uint	int1;		// third number parsed after the key
	Time	tim;		// timestamp parsed by timef
}
// Records loaded by read1, keyed by the numeric id parsed from each line.
X1*[ulong] WaitingList;

// A waiting record paired with the status text of its matching status row.
struct X2{
	X1*	x;	// the matched waiting record
	char[]	o;	// status field copied from the row; left null for ",DELIVRD," rows
}

// Matched records, split by the isType4 flag (3 = flag clear, 4 = flag set).
X2*[]	DealList3;
X2*[]	DealList4;

// Shared output buffer, allocated in main and reused for every output file.
Array buf;

// Signature of the per-chunk writer passed to save: (1-based chunk number, chunk).
alias void function (int,X2*[]) pFn;

// Accumulated time spent inside File.set.
Time wite_tim;
// Entry point: loads both CSV files, writes the two groups of output files
// and prints the elapsed time of each phase. Waits for console input before
// exiting.
void main(){
	scope t_start	= Clock.now;
	read1();
	read2();
	auto read_ms	= (Clock.now - t_start).millis;
	Stdout.formatln("read:{}ms ", read_ms );

	// The output buffer is created only after both inputs have been read.
	buf	= new Array(1024* 1024, 1024* 1024);

	scope t_phase	= Clock.now;
	save(DealList3, &save3);
	auto save3_ms	= (Clock.now - t_phase).millis;
	Stdout.formatln("save3:{}ms ", save3_ms );

	t_phase	= Clock.now;
	// The row sequence number restarts for the second group of files.
	inc_value	= 0;
	save(DealList4, &save4);
	auto save4_ms	= (Clock.now - t_phase).millis;
	Stdout.formatln("save4:{}ms ", save4_ms );

	// Note: the two "DealList" figures printed here are the DELIVRD counters,
	// not the lengths of the lists.
	Stdout.formatln("{}ms 写时间:{} DealList3:{} DealList4:{}", (Clock.now - t_start).millis,  wite_tim.span.millis ,iSendDelivrd, iRecvDelivrd ).flush;
	Cin.get;
}

// Loads "wait-status.csv" into WaitingList.
// Lines shorter than 80 characters are skipped. For every other line the
// numeric fields are consumed left to right with eat/timef; lines that start
// with '4' are keyed by the number that follows the timestamp instead of the
// first number on the line. A later line with the same key replaces the
// earlier entry.
void read1(){
	const _file	= "wait-status.csv";
	scope path	= new FilePath(_file);
	scope file	= new File(_file);
	scope lines	= new Lines!(char)(file);
	foreach (line_no, line; lines){
		if( line.length >= 80 ){
			X1* rec	= new X1;
			bool type4	= line[0] is '4';
			if( type4 ){
				rec.mask	|= Mask.isType4;
			}

			// Drop the first two characters, then read the key.
			line	= line[2..$];
			ulong key	= eat!(ulong)(line);

			// Each phone number is preceded by three characters that are skipped.
			line	= line[3..$];
			rec.phone1	= eat(line);
			line	= line[3..$];
			rec.phone2	= eat(line);

			rec.int1	= eat(line);
			char[] rest	= timef(line, rec.tim);
			if( type4 ){
				// Type-4 rows are keyed by the number after the timestamp.
				key	= eat!(ulong)(rest);
			}
			WaitingList[key]	= rec;

			// Progress report every i_write_count_of_record lines.
			if( line_no && line_no % i_write_count_of_record is 0 ){
				Stdout.formatln("{:d7} 条等待 ", line_no,  rec.mask);
			}
		}
	}
	Stdout.formatln("`{}` is done \n", _file);
}

// Matches the rows of "status.csv" against WaitingList.
// Each row starts with a numeric key. Rows whose key is unknown, or whose
// record has already been matched (its `exists` bit is clear), are ignored.
// A matched record is appended to DealList4 or DealList3 depending on its
// isType4 flag. Rows that continue with ",DELIVRD," only bump the delivered
// counters; for any other row the text between the first and second comma is
// copied into X2.o.
//
// Fixes over the original: the row is length-checked before it is sliced, so
// a truncated row no longer raises a bounds error (or reads out of bounds in
// a -release build), and the unused FilePath local is gone.
void read2(){
	const _file	= "status.csv";
	scope file	= new File(_file);
	scope lines	= new Lines!(char)(file);
	const delivrd	= r",DELIVRD,";
	const offset	= delivrd.length;
	foreach (_i, line; lines){
		ulong key	=  eat!(ulong)(line);
		auto p	= key in WaitingList;
		if( p is null ){
			continue;
		}
		X1* x	= *p;
		if( (x.mask & Mask.exists) is 0 ){
			// Already consumed by an earlier status row.
			continue;
		}
		// Mark the record as consumed so later rows with this key are ignored.
		x.mask	&= Mask.none;
		X2* x2	= new X2;
		x2.x		= x;
		if( line.length >= offset && line[ 0.. offset] ==  delivrd ){
			if( x.mask & Mask.isType4 ){
				iRecvDelivrd++;
			}else{
				iSendDelivrd++;
			}
		}else if( line.length > 0 ){
			// Skip the separator after the key and keep the next field.
			// find returns line.length when there is no further comma, so
			// the slice below is always in range.
			line	= line[1..$];
			int i	= find(line, ',');
			x2.o	= line[ 0 .. i].dup ;
		}
		if( x.mask & Mask.isType4 ){
			DealList4	~= x2;
		}else{
			DealList3	~= x2;
		}
		if(_i && _i % i_write_count_of_record is 0) Stdout.formatln("{:d7} 条状态", _i);
	}
	Stdout.formatln("`{}` is done \n", _file);
}
// Consumes the next run of decimal digits from the front of `s`.
//
// Leading non-digit characters are skipped, then the following run of
// '0'..'9' is taken. `s` is advanced past everything that was consumed.
//
// Params:
//   T = result type: an integer type (the digits are converted with
//       Int.convert) or char[] (the digit run is returned as a slice of the
//       original string).
//   s = text to consume from; updated in place.
// Returns: the parsed value; T.init (0 / null) when `s` is empty, and 0 or an
//   empty slice when `s` contains no digits.
//
// Fixes over the original:
//   * the scan limit was the address OF the last character, so the final
//     character was never examined ("123" parsed as 12, "5" as 0);
//   * `return 0` made the char[] instantiation impossible to compile.
static T eat(T = int )(ref char[] s){
	if( s.length is 0 ){
		return T.init;
	}
	char* start	= s.ptr;
	char* end	= start + s.length;	// one past the last character
	char* p	= start;
	// Skip everything that is not a digit.
	while( p < end && (*p < '0' || *p > '9' ) ){
		p++;
	}
	char* first	= p;
	// Take the digit run.
	while( p < end &&  (*p >='0' && *p <= '9') ) {
		p++;
	}
	s	= s[p - start .. $];
	static if( is(T == char[]) ){
		return  first[0 .. p - first];
	}else{
		return  cast(T) Int.convert!(char)(first[0 .. p - first]);
	}
}
// Parses seven consecutive numbers from `s` (year, month, day, hours,
// minutes, seconds, milliseconds, in that order) and stores the resulting
// point in time in `tim`.
// Returns: what is left of `s` after the seven numbers.
char[] timef(char[] s, ref Time tim){
	// The fields have to be consumed strictly in input order.
	int year	= eat(s);
	int month	= eat(s);
	int day	= eat(s);
	int hours	= eat(s);
	int minutes	= eat(s);
	int seconds	= eat(s);
	int millis	= eat(s);

	DateTime stamp;
	stamp.date.era	= Gregorian.AD_ERA;
	stamp.date.year	= year;
	stamp.date.month	= month;
	stamp.date.day	= day;
	stamp.time.hours	= hours;
	stamp.time.minutes	= minutes;
	stamp.time.seconds	= seconds;
	stamp.time.millis	= millis;

	tim	= Clock.fromDate(stamp);
	return s;
}


// Splits `lst` into consecutive chunks of at most i_write_count_of_record
// entries and hands each chunk to `fn` together with its 1-based chunk
// number. An empty list results in no call at all.
void save(X2*[] lst, pFn fn){
	int chunk_no	= 0;
	int begin	= 0;
	while( begin < lst.length ){
		int end	= (++chunk_no) * i_write_count_of_record;
		if( end > lst.length ){
			// The last chunk may be shorter.
			end	= lst.length;
		}
		fn(chunk_no, lst[begin .. end]);
		begin	= end;
	}
}

// Writes one chunk of DealList3 records to
// "./<writefiletodir>/GMO2008-08-10-08-10-<index>.txt", one CRLF-terminated
// row per record. Only the File.set call is added to wite_tim.
void save3(int index, X2*[] lst){
	char[] out_path	= Format("./{}/GMO2008-08-10-08-10-{}.txt", writefiletodir, index);
	Stdout.formatln("3: {} ", out_path).flush;

	// Rebuild the shared buffer from scratch for this file.
	buf.clear();
	for( size_t n = 0; n < lst.length; n++ ){
		format_data("00", "085101", "108511", lst[n]);
		buf("\r\n");
	}

	scope t_write	= Clock.now;
	File.set(out_path, buf.slice );
	wite_tim	+= Clock.now - t_write;
}
// Writes one chunk of DealList4 records to
// "./<writefiletodir>/GMT2008-08-10-08-10-<index>.txt", one CRLF-terminated
// row per record. Only the File.set call is added to wite_tim.
void save4(int index, X2*[] lst){
	char[] out_path	= Format("./{}/GMT2008-08-10-08-10-{}.txt", writefiletodir, index);
	Stdout.formatln("4: {}", out_path).flush;

	// Rebuild the shared buffer from scratch for this file.
	buf.clear();
	for( size_t n = 0; n < lst.length; n++ ){
		format_data("00", "085101", "108511", lst[n]);
		buf("\r\n");
	}

	scope t_write	= Clock.now;
	File.set(out_path, buf.slice );
	wite_tim	+= Clock.now - t_write;
}

// Appends one output row (without line terminator) for `x2` to the shared
// buffer `buf` and advances the global inc_value.
// `type` is accepted but not used. phone1 is written twice and phone2 once;
// the timestamp is written twice at second resolution (milliseconds are not
// emitted).
void format_data(char[] type ,char[] src_gateway,char[] dest_gateway, X2* x2){
	X1* rec	= x2.x;
	scope dt	= Clock.toDate(rec.tim);

	// Leading id: two-digit year, month, day, fixed text and the sequence number.
	char[] head	= Format("{:d2}{:d2}{:d2}0008515{:d7}	00",	dt.date.year-2000, dt.date.month, dt.date.day, ++inc_value);
	buf( head );

	char[] middle	= Format("	          13{:d9}	          13{:d9}	0	DELI	0	{:d3}	{}  15    	{} ",  rec.phone1,  rec.phone1,  rec.int1 , src_gateway, dest_gateway);
	buf( middle );

	char[] stamp	= Format("{:d4}{:d2}{:d2}{:d2}{:d2}{:d2}", dt.date.year,dt.date.month, dt.date.day, dt.time.hours,dt.time.minutes, dt.time.seconds);
	char[] tail	= Format("	13{:d9}	{}  {}",  rec.phone2, stamp, stamp);
	buf( tail );
}
ideage 2009-06-14
看了一遍,还没有弄明白。真复杂的说
hurd 2009-06-14
去掉format后,终于跑过py了。在数据量小时是原帖提供的py版本的3倍。
在110万数据时py跑33秒, 下面这个跑25秒多。 加-O -inline -release后是22秒多。
数据量大的时候性能下降的厉害,找不出原因。怀疑和GC有关。
import
	tango.core.Array,
	tango.io.Console,
	tango.io.Stdout,
	tango.io.device.File,
	tango.io.device.Array,
	tango.io.FilePath,
	tango.io.stream.Lines,
	tango.text.convert.Format,
	Int	= tango.text.convert.Integer,
	Txt	= tango.text.Util,
	TimS	= tango.text.convert.TimeStamp,
	tango.time.ISO8601,
	tango.time.Time,
	tango.time.Clock;

// Number of records per output file (see save); read1/read2 also print a
// progress line every this many input lines.
const i_write_count_of_record = 6000;
// Directory, relative to the working directory, that receives the output files.
const writefiletodir	= "test";
// Counters for ",DELIVRD," status rows: records without / with the isType4 flag.
int iSendDelivrd;
int iRecvDelivrd;
// Bit flags kept in X1.mask.
// `exists` must stay the FIRST member: a D enum variable default-initializes
// to its first member, which is how a freshly allocated X1 starts out with the
// `exists` bit set even though read1 never assigns it explicitly.
enum Mask : uint {
	exists	= 0x1,		// still unmatched; read2 clears it on the first matching status row
	isType4	= 0x2,		// the input line started with '4'
	none	= ~exists,	// AND-mask that clears only the `exists` bit
}
// One parsed row of "wait-status.csv" plus helpers that render its fields as
// zero-padded decimal text.
//
// Fix over the original: every accessor formatted into a stack-local buffer
// and returned a slice of it, so the caller received a pointer into a stack
// frame that no longer existed (the next call would overwrite the text). Each
// accessor now returns a heap copy, which stays valid after the call.
struct X1{
	Mask	 mask;			// flag bits, see Mask
	uint	phone1, phone2, int1;	// the three numbers parsed after the key
	DateTime* dt;			// timestamp fields, allocated by timef

	// Four-digit year.
	char[] year(){
		char[4] tmp;
		return Int.format(tmp, dt.date.year, "d4").dup;
	}
	// Two-digit month.
	char[] month(){
		char[2] tmp;
		return Int.format(tmp, dt.date.month, "d2").dup;
	}
	// Two-digit day of month.
	char[] day(){
		char[2] tmp;
		return Int.format(tmp, dt.date.day, "d2").dup;
	}
	// Two-digit hours.
	char[] hours(){
		char[2] tmp;
		return Int.format(tmp, dt.time.hours, "d2").dup;
	}
	// Two-digit minutes.
	char[] minutes(){
		char[2] tmp;
		return Int.format(tmp, dt.time.minutes, "d2").dup;
	}
	// Two-digit seconds.
	char[] seconds(){
		char[2] tmp;
		return Int.format(tmp, dt.time.seconds, "d2").dup;
	}
	// phone1 as nine digits.
	char[] Phone1(){
		char[9] tmp;
		return Int.format(tmp, phone1, "d9").dup;
	}
	// phone2 as nine digits.
	char[] Phone2(){
		char[9] tmp;
		return Int.format(tmp, phone2, "d9").dup;
	}
}
// Records loaded by read1, keyed by the numeric id parsed from each line.
X1*[ulong] WaitingList;

// A waiting record paired with the status text of its matching status row.
struct X2{
	X1*	x;	// the matched waiting record
	char[]	o;	// status field copied from the row; left null for ",DELIVRD," rows
}

// Matched records, split by the isType4 flag (3 = flag clear, 4 = flag set).
X2*[]	DealList3;
X2*[]	DealList4;

// Shared output buffer, allocated in main and reused for every output file.
Array buf;

// Accumulated time spent inside File.set.
Time wite_tim;
// Entry point: loads both CSV files, writes the "GMO" and "GMT" output files
// and prints the timings. Waits for console input before exiting.
void main(){
	scope t_start	= Clock.now;
	read1();
	read2();
	auto read_ms	= (Clock.now - t_start).millis;
	Stdout.formatln("read:{}ms ", read_ms );

	// The output buffer is created only after both inputs have been read.
	buf	= new Array(1024* 10240, 1024* 10240);
	save(DealList3, "GMO");
	save(DealList4, "GMT");

	// Note: the two "DealList" figures printed here are the DELIVRD counters,
	// not the lengths of the lists.
	Stdout.formatln("{}ms 磁盘io时间:{} DealList3:{} DealList4:{}", (Clock.now - t_start).millis,  wite_tim.span.millis ,iSendDelivrd, iRecvDelivrd ).flush;
	Cin.get;
}

// Loads "wait-status.csv" into WaitingList.
// Lines shorter than 80 characters are skipped. For every other line the
// numeric fields are consumed left to right with eat/timef; lines that start
// with '4' are keyed by the number that follows the timestamp instead of the
// first number on the line. A later line with the same key replaces the
// earlier entry.
void read1(){
	const _file	= "wait-status.csv";
	scope path	= new FilePath(_file);
	scope file	= new File(_file);
	scope lines	= new Lines!(char)(file);
	foreach (line_no, line; lines){
		if( line.length >= 80 ){
			X1* rec	= new X1;
			bool type4	= line[0] is '4';
			if( type4 ){
				rec.mask	|= Mask.isType4;
			}

			// Drop the first two characters, then read the key.
			line	= line[2..$];
			ulong key	= eat!(ulong)(line);

			// Each phone number is preceded by three characters that are skipped.
			line	= line[3..$];
			rec.phone1	= eat(line);
			line	= line[3..$];
			rec.phone2	= eat(line);

			rec.int1	= eat(line);
			char[] rest	= timef(line, rec.dt);
			if( type4 ){
				// Type-4 rows are keyed by the number after the timestamp.
				key	= eat!(ulong)(rest);
			}
			WaitingList[key]	= rec;

			// Progress report every i_write_count_of_record lines.
			if( line_no && line_no % i_write_count_of_record is 0 ){
				Stdout.formatln("{:d7} 条等待 ", line_no,  rec.mask);
			}
		}
	}
	Stdout.formatln("`{}` is done \n", _file).flush;
	lines.close;
}

// Matches the rows of "status.csv" against WaitingList.
// Each row starts with a numeric key. Rows whose key is unknown, or whose
// record has already been matched (its `exists` bit is clear), are ignored.
// A matched record is appended to DealList4 or DealList3 depending on its
// isType4 flag. Rows that continue with ",DELIVRD," only bump the delivered
// counters; for any other row the text between the first and second comma is
// copied into X2.o.
//
// Fixes over the original: the row is length-checked before it is sliced, so
// a truncated row no longer raises a bounds error (or reads out of bounds in
// a -release build), and the unused FilePath local is gone.
void read2(){
	const _file	= "status.csv";
	scope file	= new File(_file);
	scope lines	= new Lines!(char)(file);
	const delivrd	= r",DELIVRD,";
	const offset	= delivrd.length;
	foreach (_i, line; lines){
		ulong key	=  eat!(ulong)(line);
		auto p	= key in WaitingList;
		if( p is null ){
			continue;
		}
		X1* x	= *p;
		if( (x.mask & Mask.exists) is 0 ){
			// Already consumed by an earlier status row.
			continue;
		}
		// Mark the record as consumed so later rows with this key are ignored.
		x.mask	&= Mask.none;
		X2* x2	= new X2;
		x2.x		= x;
		if( line.length >= offset && line[ 0.. offset] ==  delivrd ){
			if( x.mask & Mask.isType4 ){
				iRecvDelivrd++;
			}else{
				iSendDelivrd++;
			}
		}else if( line.length > 0 ){
			// Skip the separator after the key and keep the next field.
			// find returns line.length when there is no further comma, so
			// the slice below is always in range.
			line	= line[1..$];
			int i	= find(line, ',');
			x2.o	= line[ 0 .. i].dup ;
		}
		if( x.mask & Mask.isType4 ){
			DealList4	~= x2;
		}else{
			DealList3	~= x2;
		}
		if(_i && _i % i_write_count_of_record is 0) Stdout.formatln("{:d7} 条状态", _i);
	}
	Stdout.formatln("`{}` is done \n", _file).flush;
	lines.close;
}
// Consumes the next run of decimal digits from the front of `s`.
//
// Leading non-digit characters are skipped, then the following run of
// '0'..'9' is taken. `s` is advanced past everything that was consumed.
//
// Params:
//   T = result type: an integer type (the digits are converted with
//       Int.convert) or char[] (the digit run is returned as a slice of the
//       original string).
//   s = text to consume from; updated in place.
// Returns: the parsed value; T.init (0 / null) when `s` is empty, and 0 or an
//   empty slice when `s` contains no digits.
//
// Fixes over the original:
//   * the scan limit was the address OF the last character, so the final
//     character was never examined ("123" parsed as 12, "5" as 0);
//   * `return 0` made the char[] instantiation impossible to compile.
static T eat(T = int )(ref char[] s){
	if( s.length is 0 ){
		return T.init;
	}
	char* start	= s.ptr;
	char* end	= start + s.length;	// one past the last character
	char* p	= start;
	// Skip everything that is not a digit.
	while( p < end && (*p < '0' || *p > '9' ) ){
		p++;
	}
	char* first	= p;
	// Take the digit run.
	while( p < end &&  (*p >='0' && *p <= '9') ) {
		p++;
	}
	s	= s[p - start .. $];
	static if( is(T == char[]) ){
		return  first[0 .. p - first];
	}else{
		return  cast(T) Int.convert!(char)(first[0 .. p - first]);
	}
}
// Allocates a new DateTime for `dt` and fills it from seven consecutive
// numbers in `s` (year, month, day, hours, minutes, seconds, milliseconds, in
// that order).
// Returns: what is left of `s` after the seven numbers.
char[] timef(char[] s, ref DateTime* dt){
	dt	= new DateTime;

	// The fields have to be consumed strictly in input order.
	int year	= eat(s);
	int month	= eat(s);
	int day	= eat(s);
	int hours	= eat(s);
	int minutes	= eat(s);
	int seconds	= eat(s);
	int millis	= eat(s);

	dt.date.era	= Gregorian.AD_ERA;
	dt.date.year	= year;
	dt.date.month	= month;
	dt.date.day	= day;
	dt.time.hours	= hours;
	dt.time.minutes	= minutes;
	dt.time.seconds	= seconds;
	dt.time.millis	= millis;
	return s;
}

// Writes `lst` to disk in files of at most i_write_count_of_record rows.
// Files are named "./<writefiletodir>/<path>2008-08-10-08-10-<n>.txt" with n
// counting from 1. Every row carries a seven-digit sequence number that runs
// across all files written by this call. Only the File.set calls are added
// to wite_tim.
void save(X2*[] lst, char[] path){
	int seq	= 0;
	char[20] seq_buf;	// scratch space for the formatted sequence number
	int file_no	= 0;
	int begin	= 0;
	while( begin < lst.length ){
		int end	= (++file_no) * i_write_count_of_record;
		if( end > lst.length ){
			// The last file may hold fewer rows.
			end	= lst.length;
		}
		char[] out_path	= Format("./{}/{}2008-08-10-08-10-{}.txt", writefiletodir, path, file_no);

		// Rebuild the shared buffer from scratch for this file.
		buf.clear();
		foreach(entry; lst[begin .. end]){
			char[] seq_text	= Int.format(seq_buf, ++seq, "d7");
			format_data(entry, seq_text, "00", "085101", "108511");
		}

		scope t_write	= Clock.now;
		File.set(out_path, buf.slice );
		wite_tim	+= Clock.now - t_write;
		Stdout(out_path)("\n").flush;
		begin	= end;
	}
}

// Appends one "\n"-terminated output row for `x2` to the shared buffer `buf`.
//
// Params:
//   x2           = record to render (x2.x and x2.x.dt must be set)
//   inc_value    = already formatted sequence number
//   type         = accepted but not used
//   src_gateway  = text written after the int1 field
//   dest_gateway = text written before phone2
//
// Fix over the original: the row used to be assembled from slices returned by
// the X1 accessor methods. Those slices pointed into the accessors' own stack
// frames, which are dead on return and overwritten by the next call, so the
// text could be corrupted before it reached `buf`. Every field is now
// formatted into a buffer owned by this function, which stays valid until the
// row has been appended and also avoids copying the whole X1.
void format_data(X2* x2, char[] inc_value , char[] type ,char[] src_gateway,char[] dest_gateway){
	X1* x	= x2.x;
	DateTime* dt	= x.dt;

	// One scratch buffer per field so that all slices stay valid together.
	char[4] year_buf;
	char[2] month_buf, day_buf, hours_buf, minutes_buf, seconds_buf;
	char[9] phone1_buf, phone2_buf;
	char[20] int1_buf;

	char[] year	= Int.format(year_buf, dt.date.year, "d4");
	char[] month	= Int.format(month_buf, dt.date.month, "d2");
	char[] day		= Int.format(day_buf, dt.date.day, "d2");
	char[] hours	= Int.format(hours_buf, dt.time.hours, "d2");
	char[] minutes	= Int.format(minutes_buf, dt.time.minutes, "d2");
	char[] seconds	= Int.format(seconds_buf, dt.time.seconds, "d2");
	char[] phone1	= Int.format(phone1_buf, x.phone1, "d9");
	char[] phone2	= Int.format(phone2_buf, x.phone2, "d9");
	char[] int1	= Int.format(int1_buf, x.int1, "d3");

	// phone1 is written twice and the timestamp twice, as in the original row layout.
	buf(year[2..$])(month)(day)("0008515")(inc_value)("	00	          13")(phone1)
		("	          13")(phone1)("	0	DELI	0	")( int1 )("	")(src_gateway)
		("  15    	")(dest_gateway)("	13")( phone2 )
		("	")(year)(month)(day)(hours)(minutes)(seconds)
		("	")(year)(month)(day)(hours)(minutes)(seconds)("\n");
}

betty_betty2008 2009-06-14
俺不懂,听人说的:当new 巨量小 object 时,速度:D<C++<Java.其中C++比D 稍快,但Java比C++就快得多多了。

参见(帖子里有benchmark):
http://www.digitalmars.com/webnews/newsgroups.php?art_group=digitalmars.D.learn&article_id=16769
hurd 2009-06-14
只是new的话,一次申请比多次分开申请快得多。但是应用到程序里,虽然申请时间节省了好多,在长循环里对各个结构赋值时,执行到一定阶段程序就像暂停了一样,之后变得非常慢。
	// Variant A (fragment): one `new X1` per input line.
	X1*[ulong] WaitingList;
	foreach (_i, line; lines){
		X1* x	= new X1;
		// process x: assign its fields from `line`
		ulong key = ...;
		WaitingList[key] = x;
	}



一次申请,比上面的那个慢很多。。。
	// Variant B (fragment): a single array allocation sized from the file
	// length; every map entry points into that array.
	X1*[ulong] WaitingList;
	// NOTE(review): presumably assumes 88 bytes per line; if the file has more
	// lines than fileSize / 88, list[_i] below indexes out of bounds - confirm.
	int len	= path.fileSize / 88;
	X1[] list	= new X1[len];
	foreach (_i, line; lines){
		X1* x	= &list[_i];
		// process x
		ulong key = ...;
		WaitingList[key] = x;
	}
tomqyp 2009-06-14
首先要确定是要比较语言的性能还是要比较库的性能,如果是要比较语言的性能,就不应该把IO操作和业务逻辑操作混在一起计时。

像这样一行行的处理,如果没有预读和缓冲,遇上了大文件大部分操作时间都花在IO上了,相对IO内存分配上的性能优劣几乎可以忽略不记了,这种情况下就算换成汇编我估计也不会有太大的性能提升。
hurd 2009-06-15
在对数组分配方式不同造成性能下降的例子中,改变的只是内存分配,性能在超过50万条数据时下降了几倍。 而在小数据量时和前者差不多。

开始时比py性能差的原因,主要是tango的format以及缓冲使用不当造成的。
Global site tag (gtag.js) - Google Analytics