數據說明:
knnuu_...txt 文件大小 3.2G 數據格式是
user1 user2 score
..
usern userm score
我這里希望通過清洗得到:
與 user1 關系最近的 top 100 人
由于數據并非需要百分之百準確,我放棄在分隔出的數據
if len(dr)!=3 : continue
開了 7 個線程 也就是 會有 7 個 用戶 的 uid 對 top 100 uid 會出現問題
對應 總用戶數幾十萬來說 呵呵 ! 我就用這 完善7個特殊人的列表時間寫個 blog 吧
并結合 linux split , awk 等 快速實現的 猥瑣 多線程 哈哈!!
怎么修改下 速度提升 5倍,原來的 一小時 到 10多分鐘 。。。。。
# split --bytes=500m knnuu_20091123.txt knnuu/
# ls a* | awk '{system( " python uu.py "$0" & " )}'
import bsddb,sys
# Per-run state for one pass over the input chunk named on the command line.
uid = -1                # sentinel: no user id has been read yet
arr, arrsc = [], []     # parallel lists: neighbour ids and their scores
ii = 0                  # number of well-formed rows processed so far

# Lookup table keyed by the ids found in the input; its values are what gets
# written to the output file in place of the raw ids.
db = bsddb.hashopen('../id-item-y-09-10-11.db', 'c')

# One output file per input chunk, so parallel runs never share a file.
out_path = 'tc/' + sys.argv[1] + 'uid-uid-sc.txt'
fw = open(out_path, 'w')
def insertion_sort(arr, arrsc, uid, sc, limit=100):
    """Insert ``(uid, sc)`` into two parallel lists kept in descending score order.

    ``arr`` holds ids and ``arrsc`` holds the matching scores. Both lists are
    modified in place and are never left longer than ``limit`` entries, so
    together they always describe the best ``limit`` candidates seen so far.

    Args:
        arr: ids, ordered by descending score (mutated in place).
        arrsc: scores parallel to ``arr`` (mutated in place).
        uid: id of the candidate to insert.
        sc: score of the candidate.
        limit: maximum number of entries to keep; defaults to 100.

    Returns:
        None.
    """
    if limit <= 0:
        return

    size = len(arrsc)
    # Only a *full* list may reject a candidate outright; while there is
    # still room, even the lowest score seen so far must be kept.
    if size >= limit and sc < arrsc[limit - 1]:
        return

    # First slot whose score is not greater than the candidate's; a new entry
    # therefore goes in front of existing entries with an equal score.
    pos = size
    for i, existing in enumerate(arrsc):
        if existing <= sc:
            pos = i
            break

    arrsc.insert(pos, sc)
    arr.insert(pos, uid)

    # Drop whatever fell off the end so the lists stay bounded.
    del arrsc[limit:]
    del arr[limit:]
def _write_top(out, names, owner, neighbours, scores, limit=100):
    """Write up to ``limit`` ``owner<TAB>neighbour<TAB>score`` lines to ``out``.

    ``owner`` and every neighbour id are looked up in ``names`` and the stored
    values are written instead of the raw ids.
    """
    if not neighbours:
        return
    owner_name = names[owner]
    for neighbour, score in zip(neighbours[:limit], scores[:limit]):
        out.write("%s\t%s\t%s\n" % (owner_name, names[neighbour], score))
    out.flush()


# Stream the chunk named on the command line. Rows are expected to be
# "user1<TAB>user2<TAB>score" with all rows of one user1 adjacent to each
# other. When the input is a byte-split chunk of a larger file, the first and
# last users in it may be incomplete.
try:
    with open(sys.argv[1]) as src:
        for row in src:
            dr = row.split('\n')[0].split('\t')
            if len(dr) != 3:
                continue  # malformed row (e.g. cut by the byte split)
            u1, u2, strsc = dr
            try:
                sc = float(strsc)
            except ValueError:
                continue  # truncated or garbage score: treat as malformed
            if uid == -1:
                uid = u1
            if u1 != uid:
                # A new user starts: emit the finished list under the user
                # it was collected for, then start the new list with this
                # row's neighbour.
                _write_top(fw, db, uid, arr, arrsc)
                print(uid)
                arr = [u2]
                arrsc = [sc]
                uid = u1
            else:
                insertion_sort(arr, arrsc, u2, sc)
            ii += 1
    # No later row will trigger a flush for the final user, so do it here.
    if uid != -1:
        _write_top(fw, db, uid, arr, arrsc)
        print(uid)
finally:
    fw.close()
# ls a* | awk '{system( " python uu.py "$0" & " )}'
import bsddb,sys
# Per-run state for one pass over the input chunk named on the command line.
uid = -1                # sentinel: no user id has been read yet
arr, arrsc = [], []     # parallel lists: neighbour ids and their scores
ii = 0                  # number of well-formed rows processed so far

# Lookup table keyed by the ids found in the input; its values are what gets
# written to the output file in place of the raw ids.
db = bsddb.hashopen('../id-item-y-09-10-11.db', 'c')

# One output file per input chunk, so parallel runs never share a file.
out_path = 'tc/' + sys.argv[1] + 'uid-uid-sc.txt'
fw = open(out_path, 'w')
def insertion_sort(arr, arrsc, uid, sc, limit=100):
    """Insert ``(uid, sc)`` into two parallel lists kept in descending score order.

    ``arr`` holds ids and ``arrsc`` holds the matching scores. Both lists are
    modified in place and are never left longer than ``limit`` entries, so
    together they always describe the best ``limit`` candidates seen so far.

    Args:
        arr: ids, ordered by descending score (mutated in place).
        arrsc: scores parallel to ``arr`` (mutated in place).
        uid: id of the candidate to insert.
        sc: score of the candidate.
        limit: maximum number of entries to keep; defaults to 100.

    Returns:
        None.
    """
    if limit <= 0:
        return

    size = len(arrsc)
    # Only a *full* list may reject a candidate outright; while there is
    # still room, even the lowest score seen so far must be kept.
    if size >= limit and sc < arrsc[limit - 1]:
        return

    # First slot whose score is not greater than the candidate's; a new entry
    # therefore goes in front of existing entries with an equal score.
    pos = size
    for i, existing in enumerate(arrsc):
        if existing <= sc:
            pos = i
            break

    arrsc.insert(pos, sc)
    arr.insert(pos, uid)

    # Drop whatever fell off the end so the lists stay bounded.
    del arrsc[limit:]
    del arr[limit:]
def _write_top(out, names, owner, neighbours, scores, limit=100):
    """Write up to ``limit`` ``owner<TAB>neighbour<TAB>score`` lines to ``out``.

    ``owner`` and every neighbour id are looked up in ``names`` and the stored
    values are written instead of the raw ids.
    """
    if not neighbours:
        return
    owner_name = names[owner]
    for neighbour, score in zip(neighbours[:limit], scores[:limit]):
        out.write("%s\t%s\t%s\n" % (owner_name, names[neighbour], score))
    out.flush()


# Stream the chunk named on the command line. Rows are expected to be
# "user1<TAB>user2<TAB>score" with all rows of one user1 adjacent to each
# other. When the input is a byte-split chunk of a larger file, the first and
# last users in it may be incomplete.
try:
    with open(sys.argv[1]) as src:
        for row in src:
            dr = row.split('\n')[0].split('\t')
            if len(dr) != 3:
                continue  # malformed row (e.g. cut by the byte split)
            u1, u2, strsc = dr
            try:
                sc = float(strsc)
            except ValueError:
                continue  # truncated or garbage score: treat as malformed
            if uid == -1:
                uid = u1
            if u1 != uid:
                # A new user starts: emit the finished list under the user
                # it was collected for, then start the new list with this
                # row's neighbour.
                _write_top(fw, db, uid, arr, arrsc)
                print(uid)
                arr = [u2]
                arrsc = [sc]
                uid = u1
            else:
                insertion_sort(arr, arrsc, u2, sc)
            ii += 1
    # No later row will trigger a flush for the final user, so do it here.
    if uid != -1:
        _write_top(fw, db, uid, arr, arrsc)
        print(uid)
finally:
    fw.close()
整理 www.aygfsteel.com/Good-Game