weka的java使用(1)——聚類
weka是著名的數據挖掘工具,在這里有詳細介紹,IDMer老師的博客里也有比較詳細的用法描述。當然,如果直接使用weka的工具,自然沒有問題,但是如果想用weka的功能在自己的平臺框架中呢?我這里放出一個當初對weka的源碼學習過程,主要是如何調用weka的api。僅供參考,代碼中有什么問題,歡迎郵件聯系。這里簡單講解一下流程。構造方法首先載入一個arff文件,然后調用doCluster()方法進行聚類。
本文用到的arff文件是weka的data目錄里自帶的標準數據集。主要格式如下:
1
/**
2
*
3
*/
4
package edu.tju.ikse.mi.util;
5
6
import java.io.File;
7
import java.io.FileNotFoundException;
8
import java.io.IOException;
9
import java.io.PrintWriter;
10
import java.util.Scanner;
11
12
import edu.tju.ikse.mi.anno.util.CfUtil;
13
14
15
import weka.clusterers.XMeans;
16
import weka.core.Instances;
17
import weka.core.converters.ArffLoader;
18
19
/**
20
* @author Jia Yu
21
* @date 2010-5-28
22
*/
23
public class WekaCluster {
24
25
/**
26
* @param args
27
*/
28
29
private ArffLoader loader;
30
private Instances dataSet;
31
private weka.clusterers.Clusterer cluster;
32
private int numOfClusters;
33
private String newAttribute;
34
private File arffFile;
35
private int sizeOfDataset;
36
37
public WekaCluster(File arffFile) {
38
this.arffFile = arffFile;
39
doCluster();
40
}
41
42
private void doCluster() {
43
loader = new ArffLoader();
44
newAttribute = "";
45
try {
46
loader.setFile(arffFile);
47
dataSet = loader.getDataSet();
48
cluster = new XMeans();
49
cluster.buildClusterer(dataSet);
50
numOfClusters = cluster.numberOfClusters();
51
StringBuilder sb = new StringBuilder();
52
for (int i = 0; i < numOfClusters; i++) {
53
sb.append("s" + (i + 1) + " ");
54
}
55
newAttribute = sb.toString().trim();
56
sizeOfDataset = dataSet.numInstances();
57
} catch (Exception e) {
58
e.printStackTrace();
59
}
60
}
61
62
public void newArffWriter() {
63
int lineNum = 0;
64
try {
65
Scanner input = new Scanner(arffFile);
66
PrintWriter out = new PrintWriter(CfUtil
67
.GetFileNameNoExtFromFileName(arffFile.getName())
68
+ "_classification.arff");
69
70
while (input.hasNext()) {
71
String line = input.nextLine();
72
if (line.startsWith("@relation")) {
73
out.println("@relation" + line.substring(9)
74
+ "_classification");
75
} else if (line.startsWith("@data")) {
76
out.println("@attribute shape {" + newAttribute + "}");
77
out.println("@data");
78
} else if (line.startsWith("@attribute")) {
79
out.println(line);
80
} else if (line.isEmpty()) {
81
out.println();
82
} else {
83
line += ",class"
84
+ (cluster.clusterInstance(dataSet
85
.instance(lineNum)) + 1);
86
out.println(line);
87
lineNum++;
88
}
89
}
90
out.close();
91
} catch (FileNotFoundException e) {
92
e.printStackTrace();
93
} catch (Exception e) {
94
e.printStackTrace();
95
}
96
}
97
98
public int clusterNewInstance(weka.core.Instance instance) {
99
int indexOfCluster = -1;
100
try {
101
indexOfCluster = cluster.clusterInstance(instance);
102
//System.out.println("cluster " + indexOfCluster);
103
} catch (Exception e) {
104
e.printStackTrace();
105
}
106
return indexOfCluster;
107
}
108
109
public double[] frequencyOfCluster() {
110
int[] sum = new int[this.numOfClusters];
111
try {
112
for (int i = 0; i < this.sizeOfDataset; i++) {
113
sum[cluster.clusterInstance(dataSet.instance(i))]++;
114
}
115
} catch (Exception e) {
116
e.printStackTrace();
117
}
118
double[] fre = new double[sum.length];
119
for (int i = 0; i < sum.length; i++) {
120
fre[i] = (double)sum[i] / (double)this.sizeOfDataset;
121
}
122
return fre;
123
}
124
125
public static void main(String[] args) {
126
File file = new File("cpu.arff");
127
WekaCluster wc = new WekaCluster(file);
128
double[] fre = wc.frequencyOfCluster();
129
for(int i=0;i<fre.length;i++)
130
System.out.println(fre[i]);
131
// wc.newArffWriter(file);
132
double[] feature = { 125,256,6000,256,16,128,199 };
133
weka.core.Instance ins = new weka.core.Instance(7);
134
for (int i = 0; i < ins.numAttributes(); i++) {
135
ins.setValue(i, feature[i]);
136
// System.out.println(ins.attribute(i).getLowerNumericBound());
137
}
138
System.out.println("cluster in : "+wc.clusterNewInstance(ins));
139
}
140
141
}
142

@relation 'cpu'
@attribute MYCT real
@attribute MMIN real
@attribute MMAX real
@attribute CACH real
@attribute CHMIN real
@attribute CHMAX real
@attribute class real
@data
125,256,6000,256,16,128,199
29,8000,32000,32,8,32,253
29,8000,32000,32,8,32,253
這里摘取了3項。運行程序執行結果如下:
0.03827751196172249
0.16267942583732056
0.69377990430622
0.10526315789473684
cluster in : 0
表示聚類方法將數據集聚為四類,程序中提供的instance被聚到第一類里。每一類的在總文件中的比率如上顯示。
具體的數據挖掘的內容就不在這里講述了。只是為大家提供一個weka的java用法實現。方便在程序中使用weka。
posted on 2010-11-04 09:24 changedi 閱讀(8370) 評論(0) 編輯 收藏 所屬分類: 機器學習