Carrot2對文章進行聚類代碼示例
Posted on 2013-01-08 15:08 沙漠中的魚 閱讀(1253) 評論(0) 編輯 收藏 所屬分類: 其他、Java、數據挖掘
public class ClusteringFlyStoneDocument {
/**
 * Runs two Carrot2 clustering passes over the documents in
 * {@code SampleDocumentData.DOCUMENTS_DATA_MINING} and prints every cluster
 * label followed by the titles of its documents to standard output.
 *
 * <p>The first pass uses {@code LingoClusteringAlgorithm} (topic clusters),
 * the second uses {@code ByUrlClusteringAlgorithm} (URL clusters).
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);
    try {
        // Documents to be clustered.
        final List<Document> documents = SampleDocumentData.DOCUMENTS_DATA_MINING;

        // Attribute map passed to the controller for the topic-clustering pass.
        final Map<String, Object> attributes = Maps.newHashMap();
        CommonAttributesDescriptor.attributeBuilder(attributes).documents(documents);
        // Default language for the clustering pass.
        // NOTE(review): the sample data set name suggests English content; confirm
        // CHINESE_SIMPLIFIED is really the intended default here.
        MultilingualClusteringDescriptor.attributeBuilder(attributes)
                .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

        // Topic clustering: attributes in, Lingo as the processing component.
        final ProcessingResult topicResult =
                controller.process(attributes, LingoClusteringAlgorithm.class);
        printClusters("=======聚類主題=====", "【主題 】", "--------", topicResult.getClusters());

        // URL clustering: documents passed directly, second argument left null.
        final ProcessingResult byDomainClusters =
                controller.process(documents, null, ByUrlClusteringAlgorithm.class);
        printClusters("=======URL聚類=======", "【URL】", "----", byDomainClusters.getClusters());
    } finally {
        // Shut the controller down even when a clustering pass throws, so the
        // instance created above is never left undisposed.
        controller.dispose();
    }
}

/**
 * Prints a header line, then each cluster's label and the titles of all of
 * its documents.
 *
 * @param header      line printed once before the clusters
 * @param labelPrefix text printed in front of each cluster label
 * @param titlePrefix text printed in front of each document title
 * @param clusters    clusters to print
 */
private static void printClusters(
        String header, String labelPrefix, String titlePrefix, List<Cluster> clusters) {
    System.out.println(header);
    for (Cluster cluster : clusters) {
        System.out.println(labelPrefix + cluster.getLabel());
        for (Document doc : cluster.getAllDocuments()) {
            System.out.println(titlePrefix + doc.getTitle());
        }
    }
}
}
/**
 * Clusters the documents in {@code SampleDocumentData.DOCUMENTS_DATA_MINING}
 * twice, once with {@code LingoClusteringAlgorithm} and once with
 * {@code ByUrlClusteringAlgorithm}, printing each cluster label and the
 * titles of its documents to standard output.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);
    try {
        // Attribute map that carries the input for the topic-clustering pass.
        final Map<String, Object> attributes = Maps.newHashMap();
        // Documents to be clustered.
        final List<Document> documents = SampleDocumentData.DOCUMENTS_DATA_MINING;
        // Register the documents in the attribute map.
        CommonAttributesDescriptor.attributeBuilder(attributes).documents(documents);
        // Set the default processing language.
        MultilingualClusteringDescriptor.attributeBuilder(attributes)
                .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

        // Run the topic clustering and print each topic with its document titles.
        final ProcessingResult topicResult =
                controller.process(attributes, LingoClusteringAlgorithm.class);
        final List<Cluster> clustersByTopic = topicResult.getClusters();
        System.out.println("=======聚類主題=====");
        for (Cluster cluster : clustersByTopic) {
            System.out.println("【主題 】" + cluster.getLabel());
            for (Document doc : cluster.getAllDocuments()) {
                System.out.println("--------" + doc.getTitle());
            }
        }

        // Run the URL-based clustering on the same documents (second argument is null).
        final ProcessingResult byDomainClusters =
                controller.process(documents, null, ByUrlClusteringAlgorithm.class);
        final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
        System.out.println("=======URL聚類=======");
        for (Cluster cluster : clustersByDomain) {
            System.out.println("【URL】" + cluster.getLabel());
            for (Document doc : cluster.getAllDocuments()) {
                System.out.println("----" + doc.getTitle());
            }
        }
    } finally {
        // Dispose the controller on every exit path, including when a
        // clustering pass throws.
        controller.dispose();
    }
}
}