Skynet

---------- ---------- 我的新 blog : liukaiyi.cublog.cn ---------- ----------

BlogJava :: 首頁 :: 聯系 :: 聚合

:: 管理

112 Posts :: 1 Stories :: 49 Comments :: 0 Trackbacks

k-means （python）算法

轉:http://www.daniweb.com/forums/thread31449.html
什么都不說了，直接看代碼吧。
注解應該寫的比較詳細

# liukaiyi
# 注 k-means ，維度類型 - 數值形式 ( 199 或 23.13

)
import sys, math, random

# -- 類化 '數據'
# 在 n-維度空間
class Point:
    def __init__(self, coords, reference=None):
        self.coords = coords
        self.n = len(coords)
        self.reference = reference
    def __repr__(self):
        return str(self.coords)

# -- 類化 '聚集點 / 聚類平均距離點 '
# -- 在 n-維度空間
# -- k-means 核心類
# -- 每次聚集各點圍繞她進行聚集
# -- 并提供方法求-聚集后的計算中心點，同時記入此次中心點(聚集各點平均距離)，為下一次聚集提供中心點.
class Cluster:
    def __init__(self, points):
        if len(points) == 0: raise Exception("ILLEGAL: EMPTY CLUSTER")
        self.points = points
        self.n = points[0].n
    for p in points:
            if p.n != self.n: raise Exception("ILLEGAL: MULTISPACE CLUSTER")
        # 求聚集各點后平均點
    self.centroid = self.calculateCentroid()
    def __repr__(self):
        return str(self.points)

    # 更新中心點，并返回原中心點與現中心點(聚集各點平均距離)距離
    def update(self, points):
        old_centroid = self.centroid
        self.points = points
        self.centroid = self.calculateCentroid()
        return getDistance(old_centroid, self.centroid)

    # 計算平均點（聚集/收集各點（離本類的中心點）最近數據,后生成新的中心點）
    def calculateCentroid(self):
        centroid_coords = []
        #  維度迭代
    for i in range(self.n):
            centroid_coords.append(0.0)
            # 收集各點迭代
        for p in self.points:
                centroid_coords[i] = centroid_coords[i]+p.coords[i]
            centroid_coords[i] = centroid_coords[i]/len(self.points)
        return Point(centroid_coords)

# -- 返回根據 k-means 聚集形成的數據集
def kmeans(points, k, cutoff):
    # Randomly sample k Points from the points list, build Clusters around them
    initial = random.sample(points, k)
    clusters = []
    for p in initial: clusters.append(Cluster([p]))
    # 迭代 k-means 直到每次迭代各收集點別的最多不超過 0.5
    while True:
        #  k 個收集數組
        lists = []
        for c in clusters: lists.append([])
    # 迭代每個數據點，并計算與每個中心點距離
    # 并把數據點添加入相應最短的中心點收集數組中
    # 在迭代中 smallest_distance 為每個點與各中心點最短距離參數，請注意看
        for p in points:
            smallest_distance = getDistance(p, clusters[0].centroid)
            index = 0
            for i in range(len(clusters[1:])):
                distance = getDistance(p, clusters[i+1].centroid)
                if distance < smallest_distance:
                    smallest_distance = distance
                    index = i+1
            # 添加到離最短中心距離的數組中
        lists[index].append(p)

        # 聚集完，計算新中心點
    # 并 cluster.centroid 屬性記入下新中心點（下一次聚集的中心點）
    # 并計算與上一次中心點距離，如果差值在 cutoff 0.5 以下 ,跳出迭代（結束，返回最后一次聚集集合）
    biggest_shift = 0.0
        for i in range(len(clusters)):
            shift = clusters[i].update(lists[i])
            biggest_shift = max(biggest_shift, shift)
        if biggest_shift < cutoff: break
    return clusters

# -- 得到歐幾里德距離兩點之間
def getDistance(a, b):
    # Forbid measurements between Points in different spaces
    if a.n != b.n: raise Exception("ILLEGAL: NON-COMPARABLE POINTS")
    # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i)
    ret = 0.0
    for i in range(a.n):
        ret = ret+pow((a.coords[i]-b.coords[i]), 2)
    return math.sqrt(ret)

# -- 在 n-維度空間中創(chuàng)建隨機點
# -- 隨機生成測試數據
def makeRandomPoint(n, lower, upper):
    coords = []
    for i in range(n): coords.append(random.uniform(lower, upper))
    return Point(coords)

# main
def main(args):
    # 參數說明
    # num_points,    n,    k,      cutoff,         lower,        upper
    # 隨機數據數量 , 維度, 聚集數, 跳出迭代最小距離 ,   維度數最大值,維度數最小值
    num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200

    # 在 n-維度空間里 , 創(chuàng)建 num_points 隨機點
    # 測試數據生成
    points = []
    for i in range(num_points): points.append(makeRandomPoint(n, lower, upper))

    # 使用 k-means 算法，來聚集數據點 (算法入口點)
    clusters = kmeans(points, k, cutoff)

    print "\nPOINTS:"
    for p in points: print "P:", p
    print "\nCLUSTERS:"
    for c in clusters: print "C:", c
if __name__ == "__main__": main(sys.argv)

整理 www.aygfsteel.com/Good-Game

posted on 2009-08-07 16:20 劉凱毅閱讀(2072) 評論(0) 編輯收藏所屬分類: python 、算法/函數

新用戶注冊刷新評論列表


只有注冊用戶登錄后才能發(fā)表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客博問管理
相關文章: MoinMoin wiki 服務器搭建與嘗試給自己的圖片處理工具 (py2exe) 跟我一起學 - 算法導論 - 快速排序 python pil 使用(轉) shell txt 分析小結跟我一起學 - 算法導論 - 遞歸式理解高斯函數，以及在推薦算法中的應用跟我一起學 - 算法導論 - 插入排序文件存儲 - 數據結構( py ) beanstalkd 消息隊列的第一手資料

Skynet

常用鏈接

留言簿(13)

我參與的團隊

隨筆分類

隨筆檔案

相冊

搜索

最新評論

閱讀排行榜

評論排行榜