隨筆-23  評論-58  文章-0  trackbacks-0
          本隱馬可夫(HMM)中文分詞詞性標注程序中的隱馬可夫(HMM)概率模型,是由 PFR 人民日報標注語料庫(199801)生成的。
          public class HMM
          {
              
          static final String[] states = new String[52];
              
          static final HashMap<String, Double> start_probability = new HashMap<String, Double>();
              
          static final HashMap<String, HashMap<String, Double>> transition_probability = new HashMap<String, HashMap<String, Double>>();
              
          static final HashMap<String, HashMap<String, Double>> emission_probability =new HashMap<String, HashMap<String, Double>>();
              
              
          static
              
          {
                  
          for(int i=0;i<52;i++)
                      states[i]
          =CountPOS.getPOSFromId(i);
              
                  InputStream is 
          = Viterbi.class.getClassLoader().getResourceAsStream("startprob.txt");
                  FileUtil.readFileByLine(is, 
          "UTF-8"new Callback(){
                       
          int ss=0;
                       
          public void execute(String line) {
                           start_probability.put(states[ss], Double.parseDouble(line));
                           ss
          ++;
                       }

                  }
          );

                  is 
          = Viterbi.class.getClassLoader().getResourceAsStream("tranprob.txt");
                  FileUtil.readFileByLine(is, 
          "UTF-8"new Callback(){
                      
          int ss=0;
                      
          public void execute(String line) {
                          HashMap
          <String, Double> t = new HashMap<String, Double>();
                          String[] cc
          =line.split("\t");
                          
          for(int j=0;j<cc.length;j++)
                              t.put(states[j], Double.parseDouble(cc[j]));
                          transition_probability.put(states[ss], t);
                          ss
          ++;
                      }

                  }
          );

                  is 
          = Viterbi.class.getClassLoader().getResourceAsStream("emissionprob.txt");
                  FileUtil.readFileByLine(is, 
          "UTF-8"new Callback(){
                      
          public void execute(String line) {
                          String[] cc
          =line.split("\t");
                          String[] nn
          =cc[1].split(" ");
                          
          for(String n:nn)
                          
          {
                              HashMap
          <String, Double> e=null;
                              String[] bb
          =n.split(":");
                              
          if(emission_probability.containsKey(bb[0]))
                                  e
          =emission_probability.get(bb[0]);
                              
          else
                                  e
          =new HashMap<String, Double>();
                              e.put(cc[
          0], Double.parseDouble(bb[1]));
                              emission_probability.put(bb[
          0], e);
                          }

                      }

                  }
          );
              }

              
              
          public static String[] tagging(String[] observations)
              
          {
                  
          return forward_viterbi(observations,states,start_probability,transition_probability,emission_probability);
              }

              
              
          public static String[]  forward_viterbi(String[] observations, String[] states,HashMap<String, Double> start_probability, HashMap<String, HashMap<String, Double>> transition_probability, HashMap<String, HashMap<String, Double>> emission_probability)
              
          {
                  
          int[][] path=new int[observations.length][states.length];
                  
          double[][] r=new double[observations.length][states.length];
                  
          for(int j=0;j<states.length;j++)
                  
          {
                      
          if(emission_probability.get(states[j])!=null && emission_probability.get(states[j]).get(observations[0])!=null)
                          r[
          0][j]=start_probability.get(states[j])*emission_probability.get(states[j]).get(observations[0]);
                      path[
          0][j]=0;
                  }

                  
                  
          for(int t=1;t<observations.length;t++)
                  
          {
                      
          for(int i=0;i<states.length;i++)
                      
          {
                          
          double tmp=0;int m=0;
                          
          for(int j=0;j<states.length;j++)
                          
          {
                              
          double tem=0;
                              
          if(emission_probability.get(states[i])!=null && emission_probability.get(states[i]).get(observations[t])!=null)
                                  tem
          =r[t-1][j]*transition_probability.get(states[j]).get(states[i]) *emission_probability.get(states[i]).get(observations[t]);
                              
          if(tem>tmp)
                              
          {
                                  tmp
          =tem;
                                  m
          =j;
                              }

                          }

                          r[t][i]
          =tmp;
                          path[t][i]
          =m;
                      }

                  }

                  
                  
          double p=0;int m=0;
                  
          for(int i=0;i<r[0].length;i++)
                  
          {
                      
          if(r[r.length-1][i]>p)
                      
          {
                          p
          =r[r.length-1][i];
                          m
          =i;
                      }

                  }

                  
          //System.out.println("p="+p);
                  int[] trace=new int[observations.length];
                  trace[observations.length
          -1]=m;
                  
          for(int t=observations.length-1;t>0;t--)
                  
          {
                      trace[t
          -1]=path[t][m];
                      m
          =path[t][m];
                  }

                  
                  String[] ret
          =new String[observations.length];
                  
          for(int i=0;i<trace.length;i++)
                      ret[i]
          =states[trace[i]];
                  
          return ret;
              }

              
              
          public static void main(String[] args)
              
          {
                 
          //String[] observations = new String[] {"這些","服務","實體","改","由","當地","有關","部門","管理"};
                  String[] observations = new String[] {"研究","生命","","起源"};
                  String[] ret
          =tagging(observations);
                  
          for(String c:ret)
                      System.out.print(c
          +",");
              }

          }

          posted on 2012-09-14 17:08 nianzai 閱讀(3868) 評論(0)  編輯  收藏 所屬分類: 中文分詞
          主站蜘蛛池模板: 兴义市| 乐陵市| 凉城县| 南投市| 万盛区| 盈江县| 江都市| 黔江区| 綦江县| 普洱| 博客| 家居| 进贤县| 西华县| 丰城市| 阿拉善盟| 沾化县| 南陵县| 六安市| 闸北区| 桑日县| 会理县| 孝感市| 万载县| 临西县| 富民县| 莆田市| 临猗县| 定日县| 宁阳县| 中西区| 长寿区| 汤原县| 公安县| 翼城县| 白山市| 北流市| 班玛县| 图木舒克市| 晴隆县| 福贡县|