Change Dir

先知cd——熱愛生活是一切藝術的開始

導航

<

2010年11月

>

日

一

二

三

四

五

六

31

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

1

2

3

4

5

6

7

8

9

10

11

公告

寫下來的都是資源，分享給互聯網~~均屬原創隨筆。
轉載引用請注明作者changedi。
喜歡應用研究，熱愛編程，歡迎交流。

隨筆分類(125)

隨筆檔案(123)

統計

隨筆 - 222
文章 - 0
評論 - 182
引用 - 0

留言簿(18)

積分與排名

積分 - 421898
排名 - 132

“牛”們的博客

COS

DBA Notes

IDMer

Java之BlueDavy

Java之Denniis

Java之放翁

設計模式

閱讀排行榜

評論排行榜

weka的java使用(1)——聚類

weka是著名的數據挖掘工具，在這里有詳細介紹，IDMer老師的博客里也有比較詳細的用法描述。當然，如果直接使用weka的工具，自然沒有問題，但是如果想用weka的功能在自己的平臺框架中呢？我這里放出一個當初對weka的源碼學習過程，主要是如何調用weka的api。僅供參考，代碼中有什么問題，歡迎郵件聯系。
這里簡單講解一下流程。構造方法首先載入一個arff文件，然后調用doCluster（）方法進行聚類。
本文用到的arff文件是weka的data目錄里自帶的標準數據集。主要格式如下：

1

/**
2

*
3

*/
4

package edu.tju.ikse.mi.util;
5

6

import java.io.File;
7

import java.io.FileNotFoundException;
8

import java.io.IOException;
9

import java.io.PrintWriter;
10

import java.util.Scanner;
11

12

import edu.tju.ikse.mi.anno.util.CfUtil;
13

14

15

import weka.clusterers.XMeans;
16

import weka.core.Instances;
17

import weka.core.converters.ArffLoader;
18

19

/**
20

* @author Jia Yu
21

* @date 2010-5-28
22

*/
23

public class WekaCluster {
24

25

/**
26

* @param args
27

*/
28

29

private ArffLoader loader;
30

private Instances dataSet;
31

private weka.clusterers.Clusterer cluster;
32

private int numOfClusters;
33

private String newAttribute;
34

private File arffFile;
35

private int sizeOfDataset;
36

37

public WekaCluster(File arffFile) {
38

this.arffFile = arffFile;
39

doCluster();
40

}
41

42

private void doCluster() {
43

loader = new ArffLoader();
44

newAttribute = "";
45

try {
46

loader.setFile(arffFile);
47

dataSet = loader.getDataSet();
48

cluster = new XMeans();
49

cluster.buildClusterer(dataSet);
50

numOfClusters = cluster.numberOfClusters();
51

StringBuilder sb = new StringBuilder();
52

for (int i = 0; i < numOfClusters; i++) {
53

sb.append("s" + (i + 1) + " ");
54

}
55

newAttribute = sb.toString().trim();
56

sizeOfDataset = dataSet.numInstances();
57

} catch (Exception e) {
58

e.printStackTrace();
59

}
60

}
61

62

public void newArffWriter() {
63

int lineNum = 0;
64

try {
65

Scanner input = new Scanner(arffFile);
66

PrintWriter out = new PrintWriter(CfUtil
67

.GetFileNameNoExtFromFileName(arffFile.getName())
68

+ "_classification.arff");
69

70

while (input.hasNext()) {
71

String line = input.nextLine();
72

if (line.startsWith("@relation")) {
73

out.println("@relation" + line.substring(9)
74

+ "_classification");
75

} else if (line.startsWith("@data")) {
76

out.println("@attribute shape {" + newAttribute + "}");
77

out.println("@data");
78

} else if (line.startsWith("@attribute")) {
79

out.println(line);
80

} else if (line.isEmpty()) {
81

out.println();
82

} else {
83

line += ",class"
84

+ (cluster.clusterInstance(dataSet
85

.instance(lineNum)) + 1);
86

out.println(line);
87

lineNum++;
88

}
89

}
90

out.close();
91

} catch (FileNotFoundException e) {
92

e.printStackTrace();
93

} catch (Exception e) {
94

e.printStackTrace();
95

}
96

}
97

98

public int clusterNewInstance(weka.core.Instance instance) {
99

int indexOfCluster = -1;
100

try {
101

indexOfCluster = cluster.clusterInstance(instance);
102

//System.out.println("cluster " + indexOfCluster);
103

} catch (Exception e) {
104

e.printStackTrace();
105

}
106

return indexOfCluster;
107

}
108

109

public double[] frequencyOfCluster() {
110

int[] sum = new int[this.numOfClusters];
111

try {
112

for (int i = 0; i < this.sizeOfDataset; i++) {
113

sum[cluster.clusterInstance(dataSet.instance(i))]++;
114

}
115

} catch (Exception e) {
116

e.printStackTrace();
117

}
118

double[] fre = new double[sum.length];
119

for (int i = 0; i < sum.length; i++) {
120

fre[i] = (double)sum[i] / (double)this.sizeOfDataset;
121

}
122

return fre;
123

}
124

125

public static void main(String[] args) {
126

File file = new File("cpu.arff");
127

WekaCluster wc = new WekaCluster(file);
128

double[] fre = wc.frequencyOfCluster();
129

for(int i=0;i<fre.length;i++)
130

System.out.println(fre[i]);
131

// wc.newArffWriter(file);
132

double[] feature = { 125,256,6000,256,16,128,199 };
133

weka.core.Instance ins = new weka.core.Instance(7);
134

for (int i = 0; i < ins.numAttributes(); i++) {
135

ins.setValue(i, feature[i]);
136

// System.out.println(ins.attribute(i).getLowerNumericBound());
137

}
138

System.out.println("cluster in : "+wc.clusterNewInstance(ins));
139

}
140

141

}
142

@relation 'cpu'
@attribute MYCT real
@attribute MMIN real
@attribute MMAX real
@attribute CACH real
@attribute CHMIN real
@attribute CHMAX real
@attribute class real
@data
125,256,6000,256,16,128,199
29,8000,32000,32,8,32,253
29,8000,32000,32,8,32,253
這里摘取了3項。運行程序執行結果如下：
0.03827751196172249
0.16267942583732056
0.69377990430622
0.10526315789473684
cluster in : 0
表示聚類方法將數據集聚為四類，程序中提供的instance被聚到第一類里。每一類在總文件中的比率如上顯示。
具體的數據挖掘的內容就不在這里講述了。只是為大家提供一個weka的java用法實現。方便在程序中使用weka。

posted on 2010-11-04 09:24 changedi 閱讀(8374) 評論(0) 編輯 收藏 所屬分類: 機器學習

新用戶注冊 刷新評論列表


只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客 博問 管理
相關文章: 決策樹和Random Forests——優秀的群體智慧 Logistic Regression——用線解決問題 weka定制計劃已添加到github weka特征預處理的一些tip weka的java使用(3)——特征選擇 weka的java使用(2)——分類 weka的java使用(1)——聚類 貝葉斯決策——總結筆記