Chan Chen Coding...

          External Sorting

          Imagine you have the numbers 1 - 9

          9  7  2  6  3  4  8  5  1 

          And let's suppose that only 3 fit in memory at a time.

          So you'd break them into chunks of 3 and sort each, storing each result in a separate file:

          279 346 158 

          Now you'd open each of the three files as streams and read the first value from each:

          2 3 1 

          Output the lowest value 1, and get the next value from that stream, now you have:

          2 3 5 

          Output the next lowest value 2, and continue onwards until you've outputted the entire sorted list

          package sort.externalsort;

          import java.util.*;
          import java.io.*;
          import java.nio.charset.Charset;

          /**
           * Goal: offer a generic external-memory sorting program in Java.
           * 
           * It must be : - hackable (easy to adapt) - scalable to large files - sensibly
           * efficient.
           * 
           * This software is in the public domain.
           * 
           * Usage: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt
           * 
           * You can change the default maximal number of temporary files with the -t
           * flag: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt
           * -t 3
           * 
           * For very large files, you might want to use an appropriate flag to allocate
           * more memory to the Java VM: java -Xms2G
           * com/google/code/externalsorting/ExternalSort somefile.txt out.txt
           * 
           * By (in alphabetical order) Philippe Beaudoin, Jon Elsas, Christan Grant,
           * Daniel Haran, Daniel Lemire, April 2010 originally posted at
           * 
          http://www.daniel
           * -lemire.com/blog/archives/2010/04/01/external-memory-sorting-in-java/
           
          */
          public class ExternalSort {

              static int DEFAULTMAXTEMPFILES = 1024;

              // we divide the file into small blocks. If the blocks
              
          // are too small, we shall create too many temporary files.
              
          // If they are too big, we shall be using too much memory.
              public static long estimateBestSizeOfBlocks(File filetobesorted, int maxtmpfiles) {
                  long sizeoffile = filetobesorted.length() * 2;
                  /**
                   * We multiply by two because later on someone insisted on counting the
                   * memory usage as 2 bytes per character. By this model, loading a file with
                   * 1 character will use 2 bytes.
                   
          */
                  // we don't want to open up much more than maxtmpfiles temporary files,
                  
          // better run
                  
          // out of memory first.
                  long blocksize = sizeoffile / maxtmpfiles + (sizeoffile % maxtmpfiles == 0 ? 0 : 1);

                  // on the other hand, we don't want to create many temporary files
                  
          // for naught. If blocksize is smaller than half the free memory, grow it.
                  long freemem = Runtime.getRuntime().freeMemory();
                  if (blocksize < freemem / 2) {
                      blocksize = freemem / 2;
                  }
                  return blocksize;
              }

              /**
               * This will simply load the file by blocks of x rows, then sort them
               * in-memory, and write the result to temporary files that have to be merged
               * later.
               * 
               * 
          @param file
               *          some flat file
               * 
          @param cmp
               *          string comparator
               * 
          @return a list of temporary flat files
               
          */
              public static List<File> sortInBatch(File file, Comparator<String> cmp) throws IOException {
                  return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, Charset.defaultCharset(), null);
              }

              /**
               * This will simply load the file by blocks of x rows, then sort them
               * in-memory, and write the result to temporary files that have to be merged
               * later. You can specify a bound on the number of temporary files that will
               * be created.
               * 
               * 
          @param file
               *          some flat file
               * 
          @param cmp
               *          string comparator
               * 
          @param maxtmpfiles
               *          maximal number of temporary files
               * 
          @param Charset
               *          character set to use (can use Charset.defaultCharset())
               * 
          @param tmpdirectory
               *          location of the temporary files (set to null for default location)
               * 
          @return a list of temporary flat files
               
          */
              public static List<File> sortInBatch(File file, Comparator<String> cmp, int maxtmpfiles, Charset cs,
                      File tmpdirectory) throws IOException {
                  List<File> files = new ArrayList<File>();
                  BufferedReader fbr = new BufferedReader(new InputStreamReader(new FileInputStream(file), cs));
                  long blocksize = estimateBestSizeOfBlocks(file, maxtmpfiles);// in bytes

                  try {
                      List<String> tmplist = new ArrayList<String>();
                      String line = "";
                      try {
                          while (line != null) {
                              long currentblocksize = 0;// in bytes
                              while ((currentblocksize < blocksize) && ((line = fbr.readLine()) != null)) { // as
                                                                                                                                                                                          
          // long
                                                                                                                                                                                          
          // as
                                                                                                                                                                                          
          // you
                                                                                                                                                                                          
          // have
                                                                                                                                                                                          
          // enough
                                                                                                                                                                                          
          // memory
                                  tmplist.add(line);
                                  currentblocksize += line.length() * 2; // java uses 16 bits per
                                                                                                                  
          // character?
                              }
                              files.add(sortAndSave(tmplist, cmp, cs, tmpdirectory));
                              tmplist.clear();
                          }
                      } catch (EOFException oef) {
                          if (tmplist.size() > 0) {
                              files.add(sortAndSave(tmplist, cmp, cs, tmpdirectory));
                              tmplist.clear();
                          }
                      }
                  } finally {
                      fbr.close();
                  }
                  return files;
              }

              /**
               * Sort a list and save it to a temporary file
               * 
               * 
          @return the file containing the sorted data
               * 
          @param tmplist
               *          data to be sorted
               * 
          @param cmp
               *          string comparator
               * 
          @param cs
               *          charset to use for output (can use Charset.defaultCharset())
               * 
          @param tmpdirectory
               *          location of the temporary files (set to null for default location)
               
          */
              public static File sortAndSave(List<String> tmplist, Comparator<String> cmp, Charset cs, File tmpdirectory)
                      throws IOException {
                  Collections.sort(tmplist, cmp);
                  File newtmpfile = File.createTempFile("sortInBatch", "flatfile", tmpdirectory);
                  newtmpfile.deleteOnExit();
                  BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(newtmpfile), cs));
                  try {
                      for (String r : tmplist) {
                          fbw.write(r);
                          fbw.newLine();
                      }
                  } finally {
                      fbw.close();
                  }
                  return newtmpfile;
              }

              /**
               * This merges a bunch of temporary flat files
               * 
               * 
          @param files
               * 
          @param output
               *          file
               * 
          @return The number of lines sorted. (P. Beaudoin)
               
          */
              public static int mergeSortedFiles(List<File> files, File outputfile, final Comparator<String> cmp)
                      throws IOException {
                  return mergeSortedFiles(files, outputfile, cmp, Charset.defaultCharset());
              }

              /**
               * This merges a bunch of temporary flat files
               * 
               * 
          @param files
               * 
          @param output
               *          file
               * 
          @param Charset
               *          character set to use to load the strings
               * 
          @return The number of lines sorted. (P. Beaudoin)
               
          */
              public static int mergeSortedFiles(List<File> files, File outputfile, final Comparator<String> cmp,
                      Charset cs) throws IOException {
                  PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>(11,
                          new Comparator<BinaryFileBuffer>() {
                              public int compare(BinaryFileBuffer i, BinaryFileBuffer j) {
                                  return cmp.compare(i.peek(), j.peek());
                              }
                          });
                  for (File f : files) {
                      BinaryFileBuffer bfb = new BinaryFileBuffer(f, cs);
                      pq.add(bfb);
                  }
                  BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), cs));
                  int rowcounter = 0;
                  try {
                      while (pq.size() > 0) {
                          BinaryFileBuffer bfb = pq.poll();
                          String r = bfb.pop();
                          fbw.write(r);
                          fbw.newLine();
                          ++rowcounter;
                          if (bfb.empty()) {
                              bfb.fbr.close();
                              bfb.originalfile.delete();// we don't need you anymore
                          } else {
                              pq.add(bfb); // add it back
                          }
                      }
                  } finally {
                      fbw.close();
                      for (BinaryFileBuffer bfb : pq)
                          bfb.close();
                  }
                  return rowcounter;
              }

              public static void main(String[] args) throws IOException {

                  boolean verbose = false;
                  int maxtmpfiles = DEFAULTMAXTEMPFILES;
                  Charset cs = Charset.defaultCharset();
                  String inputfile = null, outputfile = null;
                  for (int param = 0; param < args.length; ++param) {
                      if (args[param].equals("-v") || args[param].equals("--verbose"))
                          verbose = true;
                      else if ((args[param].equals("-t") || args[param].equals("--maxtmpfiles")) && args.length > param + 1) {
                          param++;
                          maxtmpfiles = Integer.parseInt(args[param]);
                      } else if ((args[param].equals("-c") || args[param].equals("--charset")) && args.length > param + 1) {
                          param++;
                          cs = Charset.forName(args[param]);
                      } else {
                          if (inputfile == null)
                              inputfile = args[param];
                          else if (outputfile == null)
                              outputfile = args[param];
                          else
                              System.out.println("Unparsed: " + args[param]);
                      }
                  }
                  if (outputfile == null) {
                      System.out.println("please provide input and output file names");
                      return;
                  }
                  Comparator<String> comparator = new Comparator<String>() {
                      public int compare(String r1, String r2) {
                          return r1.compareTo(r2);
                      }
                  };
                  List<File> l = sortInBatch(new File(inputfile), comparator, maxtmpfiles, cs, null);
                  if (verbose)
                      System.out.println("created " + l.size() + " tmp files");
                  mergeSortedFiles(l, new File(outputfile), comparator, cs);
              }
          }

          class BinaryFileBuffer {
              public static int BUFFERSIZE = 2048;
              public BufferedReader fbr;
              public File originalfile;
              private String cache;
              private boolean empty;

              public BinaryFileBuffer(File f, Charset cs) throws IOException {
                  originalfile = f;
                  fbr = new BufferedReader(new InputStreamReader(new FileInputStream(f), cs), BUFFERSIZE);
                  reload();
              }

              public boolean empty() {
                  return empty;
              }

              private void reload() throws IOException {
                  try {
                      if ((this.cache = fbr.readLine()) == null) {
                          empty = true;
                          cache = null;
                      } else {
                          empty = false;
                      }
                  } catch (EOFException oef) {
                      empty = true;
                      cache = null;
                  }
              }

              public void close() throws IOException {
                  fbr.close();
              }

              public String peek() {
                  if (empty())
                      return null;
                  return cache.toString();
              }

              public String pop() throws IOException {
                  String answer = peek();
                  reload();
                  return answer;
              }

          }

           



          -----------------------------------------------------
          Silence, the way to avoid many problems;
          Smile, the way to solve many problems;

          posted on 2012-07-04 02:27 Chan Chen 閱讀(465) 評(píng)論(0)  編輯  收藏 所屬分類: Algorithm

          主站蜘蛛池模板: 玉门市| 沂源县| 崇左市| 奉化市| 建宁县| 和平区| 永昌县| 乾安县| 团风县| 平泉县| 襄樊市| 永靖县| 明溪县| 渑池县| 潼关县| 柏乡县| 四平市| 闵行区| 郧西县| 安阳县| 汝城县| 安徽省| 津市市| 丹东市| 五台县| 西乡县| 昆明市| 卓资县| 阿坝| 玉门市| 青田县| 腾冲县| 什邡市| 宝清县| 南投县| 静乐县| 南陵县| 永嘉县| 鄄城县| 宁强县| 建瓯市|