隨筆 - 24, 文章 - 6, 評論 - 70, 引用 - 0
          數據加載中……

          Search Crawler 源碼

          /*
          RComponent 網絡組件 .java版
          提供對FTP, NTP, POP3, SMTP編程組件
          下載
          http://www.rcomponet.com

          */

          import java.awt.BorderLayout;
          import java.awt.Cursor;
          import java.awt.Font;
          import java.awt.GridBagConstraints;
          import java.awt.GridBagLayout;
          import java.awt.Insets;
          import java.awt.event.ActionEvent;
          import java.awt.event.ActionListener;
          import java.awt.event.KeyEvent;
          import java.awt.event.WindowAdapter;
          import java.awt.event.WindowEvent;
          import java.io.BufferedReader;
          import java.io.FileWriter;
          import java.io.InputStreamReader;
          import java.io.PrintWriter;
          import java.net.URL;
          import java.util.ArrayList;
          import java.util.HashMap;
          import java.util.HashSet;
          import java.util.LinkedHashSet;
          import java.util.regex.Matcher;
          import java.util.regex.Pattern;

          import javax.swing.BorderFactory;
          import javax.swing.JButton;
          import javax.swing.JCheckBox;
          import javax.swing.JComboBox;
          import javax.swing.JFrame;
          import javax.swing.JLabel;
          import javax.swing.JMenu;
          import javax.swing.JMenuBar;
          import javax.swing.JMenuItem;
          import javax.swing.JOptionPane;
          import javax.swing.JPanel;
          import javax.swing.JProgressBar;
          import javax.swing.JScrollPane;
          import javax.swing.JSeparator;
          import javax.swing.JTable;
          import javax.swing.JTextField;
          import javax.swing.table.DefaultTableModel;

          /**
           * Swing application that crawls the web starting from a given URL and lists
           * every page whose contents contain all terms of a search string.
           *
           * <p>Threading: all Swing components are touched only on the event dispatch
           * thread (EDT). The crawl itself runs on a background thread and hands its
           * GUI updates to the EDT through {@link #runOnEventThread(Runnable)}.
           */
          public class SearchCrawler extends JFrame {
            // Max URLs drop-down values.
            private static final String[] MAX_URLS = { "50", "100", "500", "1000" };

            // Sentinel for "no limit on the number of URLs to crawl".
            private static final int NO_LIMIT = -1;

            // robots.txt field that introduces a disallowed path prefix.
            private static final String DISALLOW = "Disallow:";

            // Matches the href value of an anchor tag; group 1 is the link.
            private static final Pattern LINK_PATTERN = Pattern.compile(
                "<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]", Pattern.CASE_INSENSITIVE);

            // Separator used to split the search string into terms.
            private static final Pattern WHITESPACE_PATTERN = Pattern.compile("[\\s]+");

            // Cache of robot disallow lists, keyed by lower-cased host name.
            // Only accessed from the crawling thread.
            private final HashMap disallowListCache = new HashMap();

            // Search GUI controls.
            private JTextField startTextField;

            private JComboBox maxComboBox;

            private JCheckBox limitCheckBox;

            private JTextField logTextField;

            private JTextField searchTextField;

            private JCheckBox caseCheckBox;

            private JButton searchButton;

            // Search stats GUI controls.
            private JLabel crawlingLabel2;

            private JLabel crawledLabel2;

            private JLabel toCrawlLabel2;

            private JProgressBar progressBar;

            private JLabel matchesLabel2;

            // Table listing search matches.
            private JTable table;

            // Flag for whether or not crawling is underway. Written by the EDT
            // (start/stop) and read by the crawling thread, hence volatile.
            private volatile boolean crawling;

            // True from the moment a search is started until its clean-up has run.
            // Only accessed on the EDT.
            private boolean searchActive;

            // Matches log file print writer.
            private PrintWriter logFileWriter;

            /**
             * Builds the main window: File menu, search form, statistics rows and the
             * table of matches. The window is not shown until the caller makes it
             * visible.
             */
            public SearchCrawler() {
              // Set application title.
              setTitle("Search Crawler");

              // Set window size.
              setSize(600, 600);

              // Handle window closing events.
              addWindowListener(new WindowAdapter() {
                public void windowClosing(WindowEvent e) {
                  actionExit();
                }
              });

              // Set up File menu.
              JMenuBar menuBar = new JMenuBar();
              JMenu fileMenu = new JMenu("File");
              fileMenu.setMnemonic(KeyEvent.VK_F);
              JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
              fileExitMenuItem.addActionListener(new ActionListener() {
                public void actionPerformed(ActionEvent e) {
                  actionExit();
                }
              });
              fileMenu.add(fileExitMenuItem);
              menuBar.add(fileMenu);
              setJMenuBar(menuBar);

              // Set up search panel. Components are added row by row; the order of
              // the add calls determines the grid position.
              JPanel searchPanel = new JPanel();
              GridBagConstraints constraints;
              GridBagLayout layout = new GridBagLayout();
              searchPanel.setLayout(layout);

              // Row: start URL. The field starts empty; the user must supply a URL.
              addRowLabel(searchPanel, layout, "Start URL:", 0);
              startTextField = new JTextField();
              addRowFiller(searchPanel, layout, startTextField, 0);

              // Row: max URLs, limit-to-site check box, and a blank row terminator.
              addRowLabel(searchPanel, layout, "Max URLs to Crawl:", 0);

              maxComboBox = new JComboBox(MAX_URLS);
              maxComboBox.setEditable(true);
              constraints = new GridBagConstraints();
              constraints.insets = new Insets(5, 5, 0, 0);
              layout.setConstraints(maxComboBox, constraints);
              searchPanel.add(maxComboBox);

              limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
              constraints = new GridBagConstraints();
              constraints.anchor = GridBagConstraints.WEST;
              constraints.insets = new Insets(0, 10, 0, 0);
              layout.setConstraints(limitCheckBox, constraints);
              searchPanel.add(limitCheckBox);

              JLabel blankLabel = new JLabel();
              constraints = new GridBagConstraints();
              constraints.gridwidth = GridBagConstraints.REMAINDER;
              layout.setConstraints(blankLabel, constraints);
              searchPanel.add(blankLabel);

              // Row: matches log file, defaulting to crawler.log in the working dir.
              addRowLabel(searchPanel, layout, "Matches Log File:", 0);
              String file = System.getProperty("user.dir")
                  + System.getProperty("file.separator") + "crawler.log";
              logTextField = new JTextField(file);
              addRowFiller(searchPanel, layout, logTextField, 0);

              // Row: search string and case-sensitivity check box.
              addRowLabel(searchPanel, layout, "Search String:", 0);

              searchTextField = new JTextField();
              constraints = new GridBagConstraints();
              constraints.fill = GridBagConstraints.HORIZONTAL;
              constraints.insets = new Insets(5, 5, 0, 0);
              constraints.gridwidth = 2;
              constraints.weightx = 1.0d;
              layout.setConstraints(searchTextField, constraints);
              searchPanel.add(searchTextField);

              caseCheckBox = new JCheckBox("Case Sensitive");
              constraints = new GridBagConstraints();
              constraints.insets = new Insets(5, 5, 0, 5);
              constraints.gridwidth = GridBagConstraints.REMAINDER;
              layout.setConstraints(caseCheckBox, constraints);
              searchPanel.add(caseCheckBox);

              // Row: Search/Stop button.
              searchButton = new JButton("Search");
              searchButton.addActionListener(new ActionListener() {
                public void actionPerformed(ActionEvent e) {
                  actionSearch();
                }
              });
              constraints = new GridBagConstraints();
              constraints.gridwidth = GridBagConstraints.REMAINDER;
              constraints.insets = new Insets(5, 5, 5, 5);
              layout.setConstraints(searchButton, constraints);
              searchPanel.add(searchButton);

              // Row: separator between the form and the statistics.
              addRowFiller(searchPanel, layout, new JSeparator(), 5);

              // Statistics rows.
              addRowLabel(searchPanel, layout, "Crawling:", 0);
              crawlingLabel2 = newStatLabel();
              addRowFiller(searchPanel, layout, crawlingLabel2, 0);

              addRowLabel(searchPanel, layout, "Crawled URLs:", 0);
              crawledLabel2 = newStatLabel();
              addRowFiller(searchPanel, layout, crawledLabel2, 0);

              addRowLabel(searchPanel, layout, "URLs to Crawl:", 0);
              toCrawlLabel2 = newStatLabel();
              addRowFiller(searchPanel, layout, toCrawlLabel2, 0);

              addRowLabel(searchPanel, layout, "Crawling Progress:", 0);
              progressBar = new JProgressBar();
              progressBar.setMinimum(0);
              progressBar.setStringPainted(true);
              addRowFiller(searchPanel, layout, progressBar, 0);

              addRowLabel(searchPanel, layout, "Search Matches:", 10);
              matchesLabel2 = newStatLabel();
              addRowFiller(searchPanel, layout, matchesLabel2, 10);

              // Set up matches table.
              table = new JTable(newMatchesModel());

              // Set up Matches panel.
              JPanel matchesPanel = new JPanel();
              matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
              matchesPanel.setLayout(new BorderLayout());
              matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);

              // Add panels to display.
              getContentPane().setLayout(new BorderLayout());
              getContentPane().add(searchPanel, BorderLayout.NORTH);
              getContentPane().add(matchesPanel, BorderLayout.CENTER);
            }

            // Add a right-aligned caption label that starts a form row.
            private static void addRowLabel(JPanel panel, GridBagLayout layout,
                String text, int bottomInset) {
              JLabel label = new JLabel(text);
              GridBagConstraints constraints = new GridBagConstraints();
              constraints.anchor = GridBagConstraints.EAST;
              constraints.insets = new Insets(5, 5, bottomInset, 0);
              layout.setConstraints(label, constraints);
              panel.add(label);
            }

            // Add a component that stretches horizontally and ends the current row.
            private static void addRowFiller(JPanel panel, GridBagLayout layout,
                java.awt.Component component, int bottomInset) {
              GridBagConstraints constraints = new GridBagConstraints();
              constraints.fill = GridBagConstraints.HORIZONTAL;
              constraints.gridwidth = GridBagConstraints.REMAINDER;
              constraints.insets = new Insets(5, 5, bottomInset, 5);
              layout.setConstraints(component, constraints);
              panel.add(component);
            }

            // Create an empty statistics value label rendered in a plain font.
            private static JLabel newStatLabel() {
              JLabel label = new JLabel();
              label.setFont(label.getFont().deriveFont(Font.PLAIN));
              return label;
            }

            // Create an empty, read-only, single-column ("URL") matches model.
            private static DefaultTableModel newMatchesModel() {
              return new DefaultTableModel(new Object[][] {}, new String[] { "URL" }) {
                public boolean isCellEditable(int row, int column) {
                  return false;
                }
              };
            }

            // Run the task on the EDT: immediately when already there, queued
            // otherwise. Queued tasks execute in submission order.
            private static void runOnEventThread(Runnable task) {
              if (javax.swing.SwingUtilities.isEventDispatchThread()) {
                task.run();
              } else {
                javax.swing.SwingUtilities.invokeLater(task);
              }
            }

            // Exit this program.
            private void actionExit() {
              System.exit(0);
            }

            /**
             * Handles a click on the Search/Stop button. While a search is active the
             * click requests a stop; otherwise the form is validated and, when valid,
             * a new search is started. Runs on the EDT.
             */
            private void actionSearch() {
              // If stop button clicked, turn crawling flag off. The button stays
              // disabled until the running search has cleaned up, so a second search
              // cannot be started on top of the first.
              if (searchActive) {
                crawling = false;
                searchButton.setEnabled(false);
                return;
              }

              ArrayList errorList = new ArrayList();

              // Validate that start URL has been entered and is well formed.
              String startUrl = startTextField.getText().trim();
              if (startUrl.length() < 1) {
                errorList.add("Missing Start URL.");
              } else if (verifyUrl(startUrl) == null) {
                errorList.add("Invalid Start URL.");
              }

              // Validate that Max URLs is either empty (no limit) or a positive
              // number.
              int maxUrls = NO_LIMIT;
              Object selected = maxComboBox.getSelectedItem();
              String max = (selected == null) ? "" : selected.toString().trim();
              if (max.length() > 0) {
                try {
                  maxUrls = Integer.parseInt(max);
                } catch (NumberFormatException e) {
                  // Not a number: force the range check below to reject it.
                  maxUrls = 0;
                }
                if (maxUrls < 1) {
                  errorList.add("Invalid Max URLs value.");
                }
              }

              // Validate that matches log file has been entered.
              String logFile = logTextField.getText().trim();
              if (logFile.length() < 1) {
                errorList.add("Missing Matches Log File.");
              }

              // Validate that search string has been entered.
              String searchString = searchTextField.getText().trim();
              if (searchString.length() < 1) {
                errorList.add("Missing Search String.");
              }

              // Show errors, if any, and return.
              if (errorList.size() > 0) {
                StringBuffer message = new StringBuffer();

                // Concatenate errors into single message, one per line.
                for (int i = 0; i < errorList.size(); i++) {
                  if (i > 0) {
                    message.append("\n");
                  }
                  message.append(errorList.get(i));
                }

                showError(message.toString());
                return;
              }

              // Remove "www" from start URL if present.
              startUrl = removeWwwFromUrl(startUrl);

              // Start the Search Crawler.
              search(logFile, startUrl, maxUrls, searchString);
            }

            /**
             * Prepares the GUI for a search and runs the crawl on a background
             * thread. Must be called on the EDT.
             *
             * @param logFile path of the file that receives one matching URL per line
             * @param startUrl verified URL at which crawling begins
             * @param maxUrls maximum number of pages to crawl, or -1 for no limit
             * @param searchString whitespace-separated terms a page must contain
             */
            private void search(final String logFile, final String startUrl,
                final int maxUrls, final String searchString) {
              // Open matches log file first, so that a failure leaves the GUI
              // untouched and usable.
              try {
                logFileWriter = new PrintWriter(new FileWriter(logFile));
              } catch (Exception e) {
                showError("Unable to open matches log file.");
                return;
              }

              // Show hour glass cursor while crawling is under way.
              setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

              // Disable search controls and switch Search button to "Stop."
              setSearchControlsEnabled(false);
              searchButton.setText("Stop");

              // Reset stats.
              table.setModel(newMatchesModel());
              updateStats(startUrl, 0, 0, maxUrls);

              // Read the options here, on the EDT, rather than from the worker.
              final boolean limitHost = limitCheckBox.isSelected();
              final boolean caseSensitive = caseCheckBox.isSelected();

              // Turn flags on before the worker starts so a quick Stop click is
              // never mistaken for a new search request.
              searchActive = true;
              crawling = true;

              Thread thread = new Thread(new Runnable() {
                public void run() {
                  try {
                    // Perform the actual crawling.
                    crawl(startUrl, maxUrls, limitHost, searchString,
                        caseSensitive);
                  } finally {
                    // Turn crawling flag off and restore the GUI even if the
                    // crawl failed unexpectedly.
                    crawling = false;
                    runOnEventThread(new Runnable() {
                      public void run() {
                        finishSearch();
                      }
                    });
                  }
                }
              });
              thread.start();
            }

            // Enable or disable the search form controls as a group.
            private void setSearchControlsEnabled(boolean enabled) {
              startTextField.setEnabled(enabled);
              maxComboBox.setEnabled(enabled);
              limitCheckBox.setEnabled(enabled);
              logTextField.setEnabled(enabled);
              searchTextField.setEnabled(enabled);
              caseCheckBox.setEnabled(enabled);
            }

            // Close the log and return the GUI to its idle state. Runs on the EDT
            // after every update queued by the crawl has been processed.
            private void finishSearch() {
              // Close matches log file.
              try {
                logFileWriter.close();
              } catch (Exception e) {
                showError("Unable to close matches log file.");
              }

              // Mark search as done.
              crawlingLabel2.setText("Done");

              // Enable search controls and switch button back to "Search."
              setSearchControlsEnabled(true);
              searchButton.setText("Search");
              searchButton.setEnabled(true);
              searchActive = false;

              // Return to default cursor.
              setCursor(Cursor.getDefaultCursor());

              // Show message if search string not found.
              if (table.getRowCount() == 0) {
                JOptionPane.showMessageDialog(SearchCrawler.this,
                    "Your Search String was not found. Please try another.",
                    "Search String Not Found", JOptionPane.WARNING_MESSAGE);
              }
            }

            // Show dialog box with error message.
            private void showError(String message) {
              JOptionPane.showMessageDialog(this, message, "Error",
                  JOptionPane.ERROR_MESSAGE);
            }

            /**
             * Updates the crawling statistics shown in the GUI. May be called from
             * any thread; the update itself is performed on the EDT.
             *
             * @param crawling URL currently being crawled
             * @param crawled number of pages crawled so far
             * @param toCrawl number of URLs still queued
             * @param maxUrls crawl limit, or -1 when unlimited
             */
            private void updateStats(final String crawling, final int crawled,
                final int toCrawl, final int maxUrls) {
              runOnEventThread(new Runnable() {
                public void run() {
                  crawlingLabel2.setText(crawling);
                  crawledLabel2.setText("" + crawled);
                  toCrawlLabel2.setText("" + toCrawl);

                  // Without a limit the total is only an estimate: everything
                  // crawled plus everything currently queued.
                  if (maxUrls == NO_LIMIT) {
                    progressBar.setMaximum(crawled + toCrawl);
                  } else {
                    progressBar.setMaximum(maxUrls);
                  }
                  progressBar.setValue(crawled);

                  matchesLabel2.setText("" + table.getRowCount());
                }
              });
            }

            /**
             * Adds a match to the matches table and to the log file. May be called
             * from any thread; the work is performed on the EDT.
             *
             * @param url URL of the matching page
             */
            private void addMatch(final String url) {
              runOnEventThread(new Runnable() {
                public void run() {
                  // Add URL to matches table.
                  DefaultTableModel model = (DefaultTableModel) table.getModel();
                  model.addRow(new Object[] { url });

                  // Add URL to matches log file.
                  try {
                    logFileWriter.println(url);
                  } catch (Exception e) {
                    showError("Unable to log match.");
                  }
                }
              });
            }

            /**
             * Verifies the format of a URL.
             *
             * @param url text to check
             * @return the parsed URL, or null when the text is not a well-formed
             *         "http://" URL
             */
            private static URL verifyUrl(String url) {
              // Only allow HTTP URLs.
              if (!url.toLowerCase().startsWith("http://")) {
                return null;
              }

              try {
                return new URL(url);
              } catch (Exception e) {
                // Malformed URL: report it as invalid.
                return null;
              }
            }

            /**
             * Checks whether the host's robots.txt allows crawling the given URL.
             * Every Disallow rule in the file is honored regardless of the
             * User-agent section it belongs to. A missing or unreadable robots.txt
             * is treated as "everything allowed". The result per host is cached.
             *
             * @param urlToCheck URL about to be crawled
             * @return false when the URL's file part starts with a disallowed path
             */
            private boolean isRobotAllowed(URL urlToCheck) {
              String host = urlToCheck.getHost().toLowerCase();

              // Retrieve host's disallow list from cache.
              ArrayList disallowList = (ArrayList) disallowListCache.get(host);

              // If list is not in the cache, download and cache it.
              if (disallowList == null) {
                try {
                  disallowList = downloadDisallowList(host);
                } catch (Exception e) {
                  // Assume robot is allowed, since an exception is thrown if the
                  // robot file doesn't exist. The empty list is cached too, so the
                  // file is not requested again for every URL of this host.
                  disallowList = new ArrayList();
                }
                disallowListCache.put(host, disallowList);
              }

              // Loop through disallow list to see if crawling is allowed for the
              // given URL.
              String file = urlToCheck.getFile();
              for (int i = 0; i < disallowList.size(); i++) {
                String disallow = (String) disallowList.get(i);
                if (file.startsWith(disallow)) {
                  return false;
                }
              }

              return true;
            }

            // Fetch and parse http://<host>/robots.txt, always closing the stream.
            private static ArrayList downloadDisallowList(String host)
                throws java.io.IOException {
              URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
              BufferedReader reader = new BufferedReader(new InputStreamReader(
                  robotsFileUrl.openStream()));
              try {
                return parseDisallowList(reader);
              } finally {
                reader.close();
              }
            }

            /**
             * Extracts the disallowed path prefixes from robots.txt content.
             * Field names are matched case-insensitively, trailing "#" comments are
             * removed, and an empty Disallow value (which means "allow everything")
             * produces no entry.
             *
             * @param reader source of the robots.txt lines; not closed here
             * @return list of non-empty disallowed path prefixes, in file order
             * @throws java.io.IOException if reading fails
             */
            private static ArrayList parseDisallowList(BufferedReader reader)
                throws java.io.IOException {
              ArrayList disallowList = new ArrayList();
              String line;
              while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (!line.regionMatches(true, 0, DISALLOW, 0, DISALLOW.length())) {
                  continue;
                }

                String disallowPath = line.substring(DISALLOW.length());

                // Check disallow path for comments and remove if present.
                int commentIndex = disallowPath.indexOf('#');
                if (commentIndex != -1) {
                  disallowPath = disallowPath.substring(0, commentIndex);
                }

                // Remove leading or trailing spaces from disallow path.
                disallowPath = disallowPath.trim();

                // An empty path would be a prefix of every URL, so skip it.
                if (disallowPath.length() > 0) {
                  disallowList.add(disallowPath);
                }
              }
              return disallowList;
            }

            /**
             * Downloads the page at the given URL.
             *
             * @param pageUrl page to fetch
             * @return the page text with line terminators removed, or null when the
             *         page could not be read
             */
            private String downloadPage(URL pageUrl) {
              BufferedReader reader = null;
              try {
                // Open connection to URL for reading.
                reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));

                // Read page into buffer.
                String line;
                StringBuffer pageBuffer = new StringBuffer();
                while ((line = reader.readLine()) != null) {
                  pageBuffer.append(line);
                }

                return pageBuffer.toString();
              } catch (Exception e) {
                // Best effort: a page that cannot be read is simply skipped.
                return null;
              } finally {
                if (reader != null) {
                  try {
                    reader.close();
                  } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                  }
                }
              }
            }

            // Remove leading "www" from a URL's host if present.
            private static String removeWwwFromUrl(String url) {
              int index = url.indexOf("://www.");
              if (index == -1) {
                return url;
              }
              // Keep "scheme://" and skip the four characters of "www.".
              return url.substring(0, index + 3) + url.substring(index + 7);
            }

            /**
             * Parses page contents and retrieves the links worth crawling.
             * Empty, anchor-only, mailto and JavaScript links are skipped; relative
             * links are resolved against the page; "#..." fragments and a leading
             * "www." are removed.
             *
             * @param pageUrl URL of the page the contents came from
             * @param pageContents text of the page
             * @param crawledList URLs already crawled; links in it are skipped
             * @param limitHost when true, only links on the page's own host are kept
             * @return list of link strings in order of appearance (may repeat)
             */
            private static ArrayList retrieveLinks(URL pageUrl, String pageContents,
                HashSet crawledList, boolean limitHost) {
              Matcher m = LINK_PATTERN.matcher(pageContents);

              // Create list of link matches.
              ArrayList linkList = new ArrayList();
              while (m.find()) {
                String link = m.group(1).trim();

                // Skip empty links.
                if (link.length() < 1) {
                  continue;
                }

                // Skip links that are just page anchors.
                if (link.charAt(0) == '#') {
                  continue;
                }

                // Skip mailto links.
                if (link.indexOf("mailto:") != -1) {
                  continue;
                }

                // Skip JavaScript links.
                if (link.toLowerCase().indexOf("javascript") != -1) {
                  continue;
                }

                // Prefix absolute and relative URLs if necessary.
                if (link.indexOf("://") == -1) {
                  if (link.charAt(0) == '/') {
                    // Host-absolute path.
                    link = "http://" + pageUrl.getHost() + link;
                  } else {
                    // Relative path: resolve against the page's directory.
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                      link = "http://" + pageUrl.getHost() + "/" + link;
                    } else {
                      String path = file.substring(0, file.lastIndexOf('/') + 1);
                      link = "http://" + pageUrl.getHost() + path + link;
                    }
                  }
                }

                // Remove anchors from link.
                int index = link.indexOf('#');
                if (index != -1) {
                  link = link.substring(0, index);
                }

                // Remove leading "www" from URL's host if present.
                link = removeWwwFromUrl(link);

                // Verify link and skip if invalid.
                URL verifiedLink = verifyUrl(link);
                if (verifiedLink == null) {
                  continue;
                }

                // If specified, limit links to those having the same host as the
                // page they were found on.
                if (limitHost
                    && !pageUrl.getHost().toLowerCase().equals(
                        verifiedLink.getHost().toLowerCase())) {
                  continue;
                }

                // Skip link if it has already been crawled.
                if (crawledList.contains(link)) {
                  continue;
                }

                // Add link to list.
                linkList.add(link);
              }

              return linkList;
            }

            /**
             * Determines whether every term of the search string occurs in the page
             * contents.
             *
             * @param pageContents text to search in
             * @param searchString terms separated by whitespace
             * @param caseSensitive when false, both sides are lower-cased first
             * @return true when all terms are found
             */
            private static boolean searchStringMatches(String pageContents,
                String searchString, boolean caseSensitive) {
              // For a case-insensitive search, lowercase the page contents (and,
              // below, each term) before comparing.
              String searchContents = caseSensitive ? pageContents
                  : pageContents.toLowerCase();

              // Split search string into individual terms.
              String[] terms = WHITESPACE_PATTERN.split(searchString);

              // Check to see if each term matches.
              for (int i = 0; i < terms.length; i++) {
                String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
                if (searchContents.indexOf(term) == -1) {
                  return false;
                }
              }

              return true;
            }

            /**
             * Performs the actual crawling, searching each downloaded page for the
             * search string. Runs until the queue is empty, the limit is reached, or
             * the crawling flag is turned off; it does nothing unless that flag has
             * been turned on first. Intended to run off the EDT.
             *
             * @param startUrl URL at which crawling begins
             * @param maxUrls maximum number of pages to crawl, or -1 for no limit
             * @param limitHost when true, only links on the same host are followed
             * @param searchString whitespace-separated terms a page must contain
             * @param caseSensitive whether term matching is case-sensitive
             */
            public void crawl(String startUrl, int maxUrls, boolean limitHost,
                String searchString, boolean caseSensitive) {
              // Set up crawl lists. The LinkedHashSet keeps discovery order and
              // drops duplicate URLs.
              HashSet crawledList = new HashSet();
              LinkedHashSet toCrawlList = new LinkedHashSet();

              // Add start URL to the to crawl list.
              toCrawlList.add(startUrl);

              // Perform actual crawling by looping through the To Crawl list.
              while (crawling && toCrawlList.size() > 0) {
                // Check to see if the max URL count has been reached, if it was
                // specified.
                if (maxUrls != NO_LIMIT && crawledList.size() >= maxUrls) {
                  break;
                }

                // Take the oldest URL off the To Crawl list.
                String url = (String) toCrawlList.iterator().next();
                toCrawlList.remove(url);

                // Convert string url to URL object; skip anything malformed.
                URL verifiedUrl = verifyUrl(url);
                if (verifiedUrl == null) {
                  continue;
                }

                // Skip URL if robots are not allowed to access it.
                if (!isRobotAllowed(verifiedUrl)) {
                  continue;
                }

                // Update crawling stats.
                updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

                // Add page to the crawled list.
                crawledList.add(url);

                // Download the page at the given URL.
                String pageContents = downloadPage(verifiedUrl);

                // If the page was downloaded successfully, retrieve all its links
                // and then see if it contains the search string.
                if (pageContents != null && pageContents.length() > 0) {
                  // Retrieve list of valid links from page.
                  ArrayList links = retrieveLinks(verifiedUrl, pageContents,
                      crawledList, limitHost);

                  // Add links to the To Crawl list.
                  toCrawlList.addAll(links);

                  // Check if search string is present in page, and if so, record
                  // a match.
                  if (searchStringMatches(pageContents, searchString,
                      caseSensitive)) {
                    addMatch(url);
                  }
                }

                // Update crawling stats.
                updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
              }
            }

            // Run the Search Crawler: build and show the window on the EDT.
            public static void main(String[] args) {
              javax.swing.SwingUtilities.invokeLater(new Runnable() {
                public void run() {
                  SearchCrawler crawler = new SearchCrawler();
                  crawler.setVisible(true);
                }
              });
            }
          }
          /**
          A quantifier determines how many times an expression is matched. The quantifiers are shown here:
          +   Match one or more.
          *   Match zero or more.
          ?   Match zero or one.

          */

          /*
          Character Sequence Explanation

          <a      Look for the characters "<a".

          \\s+    Look for one or more space characters.

          href    Look for the characters "href".

          \\s*    Look for zero or more space characters.

          =       Look for the character "=".

          \\s*    Look for zero or more space characters.

          \"?     Look for zero or one quote character.

          (.*?)   Look for zero or more of any character until the next part of the pattern is matched, and place the results in a group.

          [\" |>] Look for a quote character, a space, a vertical bar ("|"), or a greater-than (">") character.

          */

          posted on 2006-04-12 13:14 大雁北飛 閱讀(859) 評論(2)  編輯  收藏

          評論

          # re: Search Crawler 源碼  回復  更多評論   

          是java的嗎? 

          2006-05-12 11:15 | liufei

          # re: Search Crawler 源碼  回復  更多評論   

          no comment for the code ?!
          2009-10-20 11:20 | yakasima

          只有注冊用戶登錄后才能發表評論。


          網站導航:
           
          主站蜘蛛池模板: 岳阳市| 文成县| 罗山县| 漳浦县| 南靖县| 平舆县| 大余县| 沙坪坝区| 从江县| 西乡县| 贞丰县| 新津县| 建宁县| 葫芦岛市| 娄烦县| 永平县| 高密市| 洛浦县| 平乐县| 云和县| 临西县| 富阳市| 巴青县| 景宁| 朝阳县| 长岛县| 稻城县| 台中市| 绥棱县| 临颍县| 仁怀市| 吴旗县| 永仁县| 宜君县| 平昌县| 江津市| 德化县| 东宁县| 桦甸市| 丁青县| 南昌市|