[疑难] D性能与Python比较!
hurd
2009-06-14
按照这个帖子里的要求,写了个D版本的处理程序,在数据不多时(小于300k),性能和py差不多。在百万条左右时,性能和py差很多,各位给看看哪里有问题。。。
http://www.iteye.com/topic/377619
import tango.core.Array,
       tango.io.Console,
       tango.io.Stdout,
       tango.io.device.File,
       tango.io.device.Array,
       tango.io.stream.Lines,
       tango.io.FilePath,
       tango.text.convert.Format,
       Int = tango.text.convert.Integer,
       Txt = tango.text.Util,
       TimS = tango.text.convert.TimeStamp,
       tango.time.ISO8601,
       tango.time.Time,
       tango.time.Clock;

/// Number of records written per output file; also the progress-report interval.
const i_write_count_of_record = 6000;
/// Directory (relative to the working directory) the output files are written to.
const writefiletodir = "test";

/// iSendDelivrd / iRecvDelivrd count matched ",DELIVRD," records (non-type-4 / type-4).
/// inc_value is the running serial number embedded in every formatted record.
int iSendDelivrd, iRecvDelivrd, inc_value;

/// Per-record flags.
/// `exists` is the first member and therefore the enum's .init value, so a
/// freshly allocated X1 starts with it set; read2 clears it (via `none`) once
/// the record has been matched, so a record is matched at most once.
enum Mask : uint {
    exists  = 1,
    isType4 = 1 << 1,
    none    = ~exists,
}

/// One parsed line of "wait-status.csv".
struct X1 {
    Mask mask;
    uint phone1, phone2, int1;
    Time tim;
}

/// Pending records, keyed by the numeric key parsed in read1.
X1*[ulong] WaitingList;

/// A pending record that was matched by a line of "status.csv".
struct X2 {
    X1*    x;
    char[] o;   // status text of the matching line when it is not ",DELIVRD,"
}

/// Matched records: DealList3 holds non-type-4 records, DealList4 type-4 records.
X2*[] DealList3, DealList4;

/// Shared output buffer, reused for every output file.
Array buf;

/// Signature of a chunk writer: (1-based chunk index, records of that chunk).
alias void function (int, X2*[]) pFn;

/// Accumulated time spent inside File.set.
Time wite_tim;

/// Reads both input files, writes both result sets and prints timings.
void main(){
    scope now = Clock.now;
    read1;
    read2;
    Stdout.formatln("read:{}ms ", (Clock.now - now).millis );

    buf = new Array(1024* 1024, 1024* 1024);

    scope now2 = Clock.now;
    save(DealList3, &save3);
    Stdout.formatln("save3:{}ms ", (Clock.now - now2).millis );

    now2 = Clock.now;
    inc_value = 0;   // serial numbers restart for the second result set
    save(DealList4, &save4);
    Stdout.formatln("save4:{}ms ", (Clock.now - now2).millis );

    Stdout.formatln("{}ms 写时间:{} DealList3:{} DealList4:{}",
                    (Clock.now - now).millis, wite_tim.span.millis,
                    iSendDelivrd, iRecvDelivrd ).flush;
    Cin.get;   // wait for input before exiting
}

/// Parses "wait-status.csv" into WaitingList.
/// Lines shorter than 80 characters are skipped. A line starting with '4' is
/// flagged isType4 and is keyed by the number that follows the timestamp
/// instead of the number at the start of the line.
void read1(){
    const _file = "wait-status.csv";
    scope file  = new File(_file);
    scope lines = new Lines!(char)(file);

    foreach (_i, line; lines){
        if( line.length < 80 ){ continue; }

        X1* x = new X1;
        if( line[0] is '4' ){ x.mask |= Mask.isType4; }

        // The fixed skips (2 and 3 characters) step over separators/prefixes
        // between the numeric fields.
        line = line[2..$];
        ulong key = eat!(ulong)(line);
        line = line[3..$];
        x.phone1 = eat(line);
        line = line[3..$];
        x.phone2 = eat(line);
        x.int1 = eat(line);

        char[] other = timef(line, x.tim);
        if( x.mask & Mask.isType4 ){ key = eat!(ulong)(other); }

        WaitingList[key] = x;

        if( _i && _i % i_write_count_of_record is 0)
            Stdout.formatln("{:d7} 条等待 ", _i, x.mask);
    }
    Stdout.formatln("`{}` is done \n", _file);
    lines.close;   // release the file handle once the whole file has been read
}

/// Matches the lines of "status.csv" against WaitingList and appends every
/// first match to DealList3 / DealList4.
void read2(){
    const _file = "status.csv";
    scope file  = new File(_file);
    scope lines = new Lines!(char)(file);

    const delivrd = r",DELIVRD,";
    const offset  = delivrd.length;

    foreach (_i, line; lines){
        ulong key = eat!(ulong)(line);
        auto p = key in WaitingList;
        if( p is null ){ continue; }

        X1* x = *p;
        if( (x.mask & Mask.exists) is 0 ){ continue; }   // already matched
        x.mask &= Mask.none;

        X2* x2 = new X2;
        x2.x = x;

        // Length check first: a short line must not be sliced past its end.
        if( line.length >= offset && line[ 0.. offset] == delivrd ){
            if( x.mask & Mask.isType4 ){
                iRecvDelivrd++;
            }else{
                iSendDelivrd++;
            }
        }else{
            // Drop the separator after the key (if any) and keep the text up
            // to the next comma as the status.
            if( line.length ){ line = line[1..$]; }
            int i = find(line, ',');
            x2.o = line[ 0 .. i].dup ;   // dup: `line` aliases the reader's buffer
        }

        if( x.mask & Mask.isType4 ){
            DealList4 ~= x2;
        }else{
            DealList3 ~= x2;
        }

        if(_i && _i % i_write_count_of_record is 0)
            Stdout.formatln("{:d7} 条状态", _i);
    }
    Stdout.formatln("`{}` is done \n", _file);
    lines.close;   // release the file handle once the whole file has been read
}

/// Skips leading non-digits in `s`, consumes the following run of decimal
/// digits and advances `s` past it.
/// Returns: the digits as a slice of the original text when T is char[],
/// otherwise their numeric value; T.init for an empty `s`.
static T eat(T = int )(ref char[] s){
    if( s.length is 0 ){ return T.init; }

    char* p   = &s[0];
    char* end = p + s.length;   // one past the last character, so it is scanned too

    while( p < end && (*p < '0' || *p > '9') ){ p++; }
    char* _p = p;
    while( p < end && (*p >= '0' && *p <= '9') ){ p++; }

    int delta = p - &s[0];
    s = s[delta .. $];

    static if( is(T == char[]) ){
        return _p[0 .. p - _p];
    }else{
        return Int.convert!(char)(_p[0 .. p - _p]);
    }
}

/// Reads seven consecutive numbers (year, month, day, hours, minutes, seconds,
/// millis) from `s`, stores the resulting time in `tim` and returns the
/// unparsed rest of `s`.
char[] timef(char[] s, ref Time tim){
    DateTime dt;
    dt.date.era     = Gregorian.AD_ERA;
    dt.date.year    = eat(s);
    dt.date.month   = eat(s);
    dt.date.day     = eat(s);
    dt.time.hours   = eat(s);
    dt.time.minutes = eat(s);
    dt.time.seconds = eat(s);
    dt.time.millis  = eat(s);
    tim = Clock.fromDate(dt);
    return s;
}

/// Splits `lst` into chunks of at most i_write_count_of_record records and
/// passes each chunk, with its 1-based index, to `fn`.
void save(X2*[] lst, pFn fn){
    for( int i , j , k ; j < lst.length ; ){
        j = (++i) * i_write_count_of_record;
        if( j > lst.length ){ j = lst.length; }
        fn(i, lst[k .. j ]);
        k = j;
    }
}

/// Formats every record of `lst` into the shared buffer, writes the buffer to
/// `_file` and adds the time spent in File.set to wite_tim.
private void write_chunk(char[] _file, X2*[] lst){
    buf.clear;
    foreach(ref x2; lst){
        format_data("00", "085101", "108511", x2);
        buf("\r\n");
    }
    scope now = Clock.now;
    File.set(_file, buf.slice );
    wite_tim += Clock.now - now;
}

/// Writes one chunk of the non-type-4 result set ("GMO…" files).
void save3(int index, X2*[] lst){
    char[] _file = Format("./{}/GMO2008-08-10-08-10-{}.txt", writefiletodir, index);
    Stdout.formatln("3: {} ", _file).flush;
    write_chunk(_file, lst);
}

/// Writes one chunk of the type-4 result set ("GMT…" files).
void save4(int index, X2*[] lst){
    char[] _file = Format("./{}/GMT2008-08-10-08-10-{}.txt", writefiletodir, index);
    Stdout.formatln("4: {}", _file).flush;
    write_chunk(_file, lst);
}

/// Appends the textual form of one record to the shared buffer and advances
/// the running serial number inc_value.
/// `type` is accepted but not used by the output.
/// NOTE(review): phone1 is emitted twice and phone2 once — confirm this matches
/// the required record layout.
void format_data(char[] type ,char[] src_gateway,char[] dest_gateway, X2* x2){
    scope dt = Clock.toDate(x2.x.tim);

    buf( Format("{:d2}{:d2}{:d2}0008515{:d7} 00",
                dt.date.year-2000, dt.date.month, dt.date.day, ++inc_value) );
    buf( Format(" 13{:d9} 13{:d9} 0 DELI 0 {:d3} {} 15 {} ",
                x2.x.phone1, x2.x.phone1, x2.x.int1 , src_gateway, dest_gateway) );

    char[] time = Format("{:d4}{:d2}{:d2}{:d2}{:d2}{:d2}",
                         dt.date.year, dt.date.month, dt.date.day,
                         dt.time.hours, dt.time.minutes, dt.time.seconds);
    buf( Format(" 13{:d9} {} {}", x2.x.phone2, time, time) ); //, dt.time.millis
}
|
|
ideage
2009-06-14
看了一遍,还没有弄明白。真复杂的说
|
|
hurd
2009-06-14
去掉format后,终于跑过py了。在数据量小时是原帖提供的py版本的3倍。
在110万数据时py跑33秒, 下面这个跑25秒多。 加-O -inline -release后是22秒多。 数据量大的时候性能下降的厉害,找不出原因。怀疑和GC有关。
import tango.core.Array,
       tango.io.Console,
       tango.io.Stdout,
       tango.io.device.File,
       tango.io.device.Array,
       tango.io.FilePath,
       tango.io.stream.Lines,
       tango.text.convert.Format,
       Int = tango.text.convert.Integer,
       Txt = tango.text.Util,
       TimS = tango.text.convert.TimeStamp,
       tango.time.ISO8601,
       tango.time.Time,
       tango.time.Clock;

/// Number of records written per output file; also the progress-report interval.
const i_write_count_of_record = 6000;
/// Directory (relative to the working directory) the output files are written to.
const writefiletodir = "test";

/// Count matched ",DELIVRD," records (non-type-4 / type-4).
int iSendDelivrd, iRecvDelivrd;

/// Per-record flags.
/// `exists` is the first member and therefore the enum's .init value, so a
/// freshly allocated X1 starts with it set; read2 clears it (via `none`) once
/// the record has been matched, so a record is matched at most once.
enum Mask : uint {
    exists  = 1,
    isType4 = 1 << 1,
    none    = ~exists,
}

/// Returns an `n`-character slice to format into: the front of `dst` when it
/// is large enough, otherwise a freshly allocated array.
private char[] slot(char[] dst, size_t n){
    return dst.length >= n ? dst[0 .. n] : new char[n];
}

/// One parsed line of "wait-status.csv".
///
/// The formatting methods write into `dst` and return a slice of it. They must
/// not format into a local buffer: a slice of a callee's stack array dangles
/// as soon as the method returns. When `dst` is omitted (or too small) a heap
/// buffer is allocated instead, so existing zero-argument calls stay valid.
struct X1 {
    Mask mask;
    uint phone1, phone2, int1;
    DateTime* dt;

    char[] year   (char[] dst = null){ return Int.format(slot(dst, 4), dt.date.year,    "d4"); }
    char[] month  (char[] dst = null){ return Int.format(slot(dst, 2), dt.date.month,   "d2"); }
    char[] day    (char[] dst = null){ return Int.format(slot(dst, 2), dt.date.day,     "d2"); }
    char[] hours  (char[] dst = null){ return Int.format(slot(dst, 2), dt.time.hours,   "d2"); }
    char[] minutes(char[] dst = null){ return Int.format(slot(dst, 2), dt.time.minutes, "d2"); }
    char[] seconds(char[] dst = null){ return Int.format(slot(dst, 2), dt.time.seconds, "d2"); }
    char[] Phone1 (char[] dst = null){ return Int.format(slot(dst, 9), phone1,          "d9"); }
    char[] Phone2 (char[] dst = null){ return Int.format(slot(dst, 9), phone2,          "d9"); }
}

/// Pending records, keyed by the numeric key parsed in read1.
X1*[ulong] WaitingList;

/// A pending record that was matched by a line of "status.csv".
struct X2 {
    X1*    x;
    char[] o;   // status text of the matching line when it is not ",DELIVRD,"
}

/// Matched records: DealList3 holds non-type-4 records, DealList4 type-4 records.
X2*[] DealList3, DealList4;

/// Shared output buffer, reused for every output file.
Array buf;

/// Accumulated time spent inside File.set.
Time wite_tim;

/// Reads both input files, writes both result sets and prints timings.
void main(){
    scope now = Clock.now;
    read1;
    read2;
    Stdout.formatln("read:{}ms ", (Clock.now - now).millis );

    buf = new Array(1024* 10240, 1024* 10240);
    save(DealList3, "GMO");
    save(DealList4, "GMT");

    Stdout.formatln("{}ms 磁盘io时间:{} DealList3:{} DealList4:{}",
                    (Clock.now - now).millis, wite_tim.span.millis,
                    iSendDelivrd, iRecvDelivrd ).flush;
    Cin.get;   // wait for input before exiting
}

/// Parses "wait-status.csv" into WaitingList.
/// Lines shorter than 80 characters are skipped. A line starting with '4' is
/// flagged isType4 and is keyed by the number that follows the timestamp
/// instead of the number at the start of the line.
void read1(){
    const _file = "wait-status.csv";
    scope file  = new File(_file);
    scope lines = new Lines!(char)(file);

    foreach (_i, line; lines){
        if( line.length < 80 ){ continue; }

        X1* x = new X1;
        if( line[0] is '4' ){ x.mask |= Mask.isType4; }

        // The fixed skips (2 and 3 characters) step over separators/prefixes
        // between the numeric fields.
        line = line[2..$];
        ulong key = eat!(ulong)(line);
        line = line[3..$];
        x.phone1 = eat(line);
        line = line[3..$];
        x.phone2 = eat(line);
        x.int1 = eat(line);

        char[] other = timef(line, x.dt);
        if( x.mask & Mask.isType4 ){ key = eat!(ulong)(other); }

        WaitingList[key] = x;

        if( _i && _i % i_write_count_of_record is 0)
            Stdout.formatln("{:d7} 条等待 ", _i, x.mask);
    }
    Stdout.formatln("`{}` is done \n", _file).flush;
    lines.close;
}

/// Matches the lines of "status.csv" against WaitingList and appends every
/// first match to DealList3 / DealList4.
void read2(){
    const _file = "status.csv";
    scope file  = new File(_file);
    scope lines = new Lines!(char)(file);

    const delivrd = r",DELIVRD,";
    const offset  = delivrd.length;

    foreach (_i, line; lines){
        ulong key = eat!(ulong)(line);
        auto p = key in WaitingList;
        if( p is null ){ continue; }

        X1* x = *p;
        if( (x.mask & Mask.exists) is 0 ){ continue; }   // already matched
        x.mask &= Mask.none;

        X2* x2 = new X2;
        x2.x = x;

        // Length check first: a short line must not be sliced past its end.
        if( line.length >= offset && line[ 0.. offset] == delivrd ){
            if( x.mask & Mask.isType4 ){
                iRecvDelivrd++;
            }else{
                iSendDelivrd++;
            }
        }else{
            // Drop the separator after the key (if any) and keep the text up
            // to the next comma as the status.
            if( line.length ){ line = line[1..$]; }
            int i = find(line, ',');
            x2.o = line[ 0 .. i].dup ;   // dup: `line` aliases the reader's buffer
        }

        if( x.mask & Mask.isType4 ){
            DealList4 ~= x2;
        }else{
            DealList3 ~= x2;
        }

        if(_i && _i % i_write_count_of_record is 0)
            Stdout.formatln("{:d7} 条状态", _i);
    }
    Stdout.formatln("`{}` is done \n", _file).flush;
    lines.close;
}

/// Skips leading non-digits in `s`, consumes the following run of decimal
/// digits and advances `s` past it.
/// Returns: the digits as a slice of the original text when T is char[],
/// otherwise their numeric value; T.init for an empty `s`.
static T eat(T = int )(ref char[] s){
    if( s.length is 0 ){ return T.init; }

    char* p   = &s[0];
    char* end = p + s.length;   // one past the last character, so it is scanned too

    while( p < end && (*p < '0' || *p > '9') ){ p++; }
    char* _p = p;
    while( p < end && (*p >= '0' && *p <= '9') ){ p++; }

    int delta = p - &s[0];
    s = s[delta .. $];

    static if( is(T == char[]) ){
        return _p[0 .. p - _p];
    }else{
        return Int.convert!(char)(_p[0 .. p - _p]);
    }
}

/// Allocates a DateTime into `dt`, fills it from seven consecutive numbers in
/// `s` (year, month, day, hours, minutes, seconds, millis) and returns the
/// unparsed rest of `s`.
char[] timef(char[] s, ref DateTime* dt){
    dt = new DateTime;
    dt.date.era     = Gregorian.AD_ERA;
    dt.date.year    = eat(s);
    dt.date.month   = eat(s);
    dt.date.day     = eat(s);
    dt.time.hours   = eat(s);
    dt.time.minutes = eat(s);
    dt.time.seconds = eat(s);
    dt.time.millis  = eat(s);
    return s;
}

/// Writes `lst` in chunks of at most i_write_count_of_record records, one file
/// per chunk; `path` is the file-name prefix. The serial number embedded in
/// each record restarts at 1 for every call.
void save(X2*[] lst, char[] path){
    int inc_value = 0;
    char[20] tmp;   // lives in this frame, so the formatted serial stays valid during format_data

    for( int i , j , k ; j < lst.length ; ){
        j = (++i) * i_write_count_of_record;
        if( j > lst.length ){ j = lst.length; }

        char[] _path = Format("./{}/{}2008-08-10-08-10-{}.txt", writefiletodir, path, i);

        buf.clear;
        foreach(x2; lst[k .. j ]){
            format_data(x2, Int.format(tmp, ++inc_value, "d7"), "00", "085101", "108511");
        }

        scope now = Clock.now;
        File.set(_path, buf.slice );
        wite_tim += Clock.now - now;

        Stdout(_path)("\n").flush;
        k = j;
    }
}

/// Appends the textual form of one record to the shared buffer.
/// `type` is accepted but not used by the output.
/// NOTE(review): phone1 is emitted twice and phone2 once — confirm this matches
/// the required record layout.
void format_data(X2* x2, char[] inc_value , char[] type ,char[] src_gateway,char[] dest_gateway){
    X1* x = x2.x;   // use the record in place instead of copying the struct

    // Scratch space owned by this frame: every formatted field stays valid
    // until the buffer chain below has consumed it.
    char[4]  ybuf;
    char[2]  mobuf, dbuf, hbuf, mibuf, sbuf;
    char[9]  p1buf, p2buf;
    char[20] tmp;

    char[] year    = x.year(ybuf);
    char[] month   = x.month(mobuf);
    char[] day     = x.day(dbuf);
    char[] phone1  = x.Phone1(p1buf);
    char[] hours   = x.hours(hbuf);
    char[] minutes = x.minutes(mibuf);
    char[] seconds = x.seconds(sbuf);

    buf(year[2..$])(month)(day)("0008515")(inc_value)(" 00 13")(phone1)
       (" 13")(phone1)(" 0 DELI 0 ")( Int.format(tmp, x.int1, "d3") )(" ")(src_gateway)
       (" 15 ")(dest_gateway)(" 13")( x.Phone2(p2buf) )
       (" ")(year)(month)(day)(hours)(minutes)(seconds)
       (" ")(year)(month)(day)(hours)(minutes)(seconds)("\n");
}
|
|
betty_betty2008
2009-06-14
俺不懂,听人说的:当new 巨量小 object 时,速度:D<C++<Java.其中C++比D 稍快,但Java比C++就快得多多了。
参见(帖子里有benchmark): http://www.digitalmars.com/webnews/newsgroups.php?art_group=digitalmars.D.learn&article_id=16769 |
|
hurd
2009-06-14
只是new的话,一次申请比多次分开申请快得多。应用到程序里,申请时间确实节省了很多,但是在长循环里对各个结构赋值时,执行到一定阶段程序就像暂停了一样,之后变得非常慢。
X1*[ulong] WaitingList;
foreach (_i, line; lines){
    X1* x = new X1;
    //x处理 根据line来赋值
    ulong key = ...;
    WaitingList[key] = x;
}
一次申请,比上面的那个慢很多。。。
X1*[ulong] WaitingList;
int len = path.fileSize / 88;
X1[] list = new X1[len];
foreach (_i, line; lines){
    X1* x = &list[_i];
    //x处理
    ulong key = ...;
    WaitingList[key] = x;
}
|
|
tomqyp
2009-06-14
首先要确定是要比较语言的性能还是要比较库的性能,如果是要比较语言的性能,就不应该把IO操作和业务逻辑操作混在一起计时。
像这样一行行的处理,如果没有预读和缓冲,遇上了大文件大部分操作时间都花在IO上了,相对IO,内存分配上的性能优劣几乎可以忽略不计了,这种情况下就算换成汇编我估计也不会有太大的性能提升。 |
|
hurd
2009-06-15
在对数组分配方式不同造成性能下降的例子中,改变的只是内存分配,性能在超过50万条数据时下降了几倍。 而在小数据量时和前者差不多。
开始对比py性能差的原因,主要是tango的format以及缓冲使用不当造成的。 |
相关讨论
相关资源推荐
- ADO实现单条记录的刷新
- Delphi中ado+datasetprovider+clientdataset控件,执行更新clientdataset.applyupdates(0) 多表 更新单表 解决方法
- 深入分析ADO.NET中的DataSet对象
- ADO API 参考( ADO方法)之四
- ADO API 参考( ADO方法)之一
- AdoDataSet.RecordSet的串行化和还原
- ADO三大对象的属性、方法、事件及常数(三)
- java ado recordset_AdoDataSet.RecordSet的串行化和还原(转载)
- ADO连接各种数据库的基本方法
- ADO API 参考( ADO 动态属性)