隨筆-23  評論-58  文章-0  trackbacks-0
          本隱馬可夫(HMM)中文分詞詞性標注程序中的隱馬可夫(HMM)概率模型，是由 PFR 人民日報標注語料庫(199801)生成。
          public class HMM
          {
              
          static final String[] states = new String[52];
              
          static final HashMap<String, Double> start_probability = new HashMap<String, Double>();
              
          static final HashMap<String, HashMap<String, Double>> transition_probability = new HashMap<String, HashMap<String, Double>>();
              
          static final HashMap<String, HashMap<String, Double>> emission_probability =new HashMap<String, HashMap<String, Double>>();
              
              
          static
              
          {
                  
          for(int i=0;i<52;i++)
                      states[i]
          =CountPOS.getPOSFromId(i);
              
                  InputStream is 
          = Viterbi.class.getClassLoader().getResourceAsStream("startprob.txt");
                  FileUtil.readFileByLine(is, 
          "UTF-8"new Callback(){
                       
          int ss=0;
                       
          public void execute(String line) {
                           start_probability.put(states[ss], Double.parseDouble(line));
                           ss
          ++;
                       }

                  }
          );

                  is 
          = Viterbi.class.getClassLoader().getResourceAsStream("tranprob.txt");
                  FileUtil.readFileByLine(is, 
          "UTF-8"new Callback(){
                      
          int ss=0;
                      
          public void execute(String line) {
                          HashMap
          <String, Double> t = new HashMap<String, Double>();
                          String[] cc
          =line.split("\t");
                          
          for(int j=0;j<cc.length;j++)
                              t.put(states[j], Double.parseDouble(cc[j]));
                          transition_probability.put(states[ss], t);
                          ss
          ++;
                      }

                  }
          );

                  is 
          = Viterbi.class.getClassLoader().getResourceAsStream("emissionprob.txt");
                  FileUtil.readFileByLine(is, 
          "UTF-8"new Callback(){
                      
          public void execute(String line) {
                          String[] cc
          =line.split("\t");
                          String[] nn
          =cc[1].split(" ");
                          
          for(String n:nn)
                          
          {
                              HashMap
          <String, Double> e=null;
                              String[] bb
          =n.split(":");
                              
          if(emission_probability.containsKey(bb[0]))
                                  e
          =emission_probability.get(bb[0]);
                              
          else
                                  e
          =new HashMap<String, Double>();
                              e.put(cc[
          0], Double.parseDouble(bb[1]));
                              emission_probability.put(bb[
          0], e);
                          }

                      }

                  }
          );
              }

              
              
          public static String[] tagging(String[] observations)
              
          {
                  
          return forward_viterbi(observations,states,start_probability,transition_probability,emission_probability);
              }

              
              
          public static String[]  forward_viterbi(String[] observations, String[] states,HashMap<String, Double> start_probability, HashMap<String, HashMap<String, Double>> transition_probability, HashMap<String, HashMap<String, Double>> emission_probability)
              
          {
                  
          int[][] path=new int[observations.length][states.length];
                  
          double[][] r=new double[observations.length][states.length];
                  
          for(int j=0;j<states.length;j++)
                  
          {
                      
          if(emission_probability.get(states[j])!=null && emission_probability.get(states[j]).get(observations[0])!=null)
                          r[
          0][j]=start_probability.get(states[j])*emission_probability.get(states[j]).get(observations[0]);
                      path[
          0][j]=0;
                  }

                  
                  
          for(int t=1;t<observations.length;t++)
                  
          {
                      
          for(int i=0;i<states.length;i++)
                      
          {
                          
          double tmp=0;int m=0;
                          
          for(int j=0;j<states.length;j++)
                          
          {
                              
          double tem=0;
                              
          if(emission_probability.get(states[i])!=null && emission_probability.get(states[i]).get(observations[t])!=null)
                                  tem
          =r[t-1][j]*transition_probability.get(states[j]).get(states[i]) *emission_probability.get(states[i]).get(observations[t]);
                              
          if(tem>tmp)
                              
          {
                                  tmp
          =tem;
                                  m
          =j;
                              }

                          }

                          r[t][i]
          =tmp;
                          path[t][i]
          =m;
                      }

                  }

                  
                  
          double p=0;int m=0;
                  
          for(int i=0;i<r[0].length;i++)
                  
          {
                      
          if(r[r.length-1][i]>p)
                      
          {
                          p
          =r[r.length-1][i];
                          m
          =i;
                      }

                  }

                  
          //System.out.println("p="+p);
                  int[] trace=new int[observations.length];
                  trace[observations.length
          -1]=m;
                  
          for(int t=observations.length-1;t>0;t--)
                  
          {
                      trace[t
          -1]=path[t][m];
                      m
          =path[t][m];
                  }

                  
                  String[] ret
          =new String[observations.length];
                  
          for(int i=0;i<trace.length;i++)
                      ret[i]
          =states[trace[i]];
                  
          return ret;
              }

              
              
          public static void main(String[] args)
              
          {
                 
          //String[] observations = new String[] {"這些","服務","實體","改","由","當地","有關","部門","管理"};
                  String[] observations = new String[] {"研究","生命","","起源"};
                  String[] ret
          =tagging(observations);
                  
          for(String c:ret)
                      System.out.print(c
          +",");
              }

          }

          posted on 2012-09-14 17:08 nianzai 閱讀(3867) 評論(0)  編輯  收藏 所屬分類: 中文分詞
          主站蜘蛛池模板: 城步| 柯坪县| 滨州市| 安岳县| 永寿县| 辽源市| 秦皇岛市| 青海省| 大渡口区| 澄迈县| 神池县| 象山县| 郑州市| 青海省| 平陆县| 芜湖市| 招远市| 广丰县| 贡嘎县| 镇雄县| 阿拉尔市| 涿鹿县| 峨山| 无棣县| 上杭县| 建宁县| 米泉市| 石河子市| 天峨县| 涟源市| 文登市| 松阳县| 宁城县| 绵阳市| 五寨县| 福建省| 老河口市| 韩城市| 奇台县| 北辰区| 武邑县|