A
download google.py
Language: Python
License: GPL
Copyright: (C) 2003-2006 Bastian Kleineidam
LOC: 48
Project Info
WebCleaner
Server: SourceForge
Type: cvs
...\webcleaner\webcleaner2\wc\
   __init__.py
   ansicolor.py
   configuration.py
   containers.py
   decorators.py
   dummy.py
   fileutil.py
   google.py
   i18n.py
   ip.py
   levenshtein.c
   log.py
   mail.py
   msgfmt.py
   start.py
   strformat.py
   update.py
   url.py
   win32start.py
   XmlUtils.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2003-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Google utility functions.
"""

import urllib, urlparse

# Base url (scheme and host, without trailing slash) used as prefix for
# all generated google urls.
google_domain = "http://www.google.com"
#google_domain = "https://www.google.de"

# HTTP response status codes on which to try google
# (410 Gone, 503 Service Unavailable, 504 Gateway Timeout).
google_try_status = (410, 503, 504)

def get_google_cache_url (url):
    """
    Build the google cache lookup url for the given url.

    The given url is escaped with urllib.quote_plus() and placed after
    the "cache:" operator of a search request on google_domain.
    """
    # escape first, so the url is safe to embed as a query value
    quoted_url = urllib.quote_plus(url)
    return "%s/search?q=cache:%s" % (google_domain, quoted_url)


def get_google_search_url (query):
    """
    Build the google search url for the given query.

    The query is inserted into the url as given; no escaping is applied
    by this function.
    """
    search_template = "%s/search?q=%s"
    return search_template % (google_domain, query)


def get_google_clean_url (url):
    """
    Reduce the given url to the form google uses for cached pages.

    Google ignores scheme, query and anchor parts when caching urls.
    Additionally it ignores unusual filenames from urls
    (example: http://slashdot.org/index.pl is not found in cache,
    but http://slashdot.org/ is found).
    Therefore the returned string consists of:
     - the host part of the url (scheme, query and anchor are dropped)
     - the path of the url if it ends with ".html" or "/", else the
       root path "/"
    """
    # items 1 and 2 of the split result are network location and path
    host, path = urlparse.urlsplit(url)[1:3]
    if not (path.endswith(".html") or path.endswith("/")):
        # unusual (or empty) path: fall back to the site root
        path = "/"
    return host + path


def get_google_context (url, response):
    """
    Build the google template context for given url and response data.

    The returned dictionary has the keys:
     - 'url': the url as given
     - 'response': the response data as given
     - 'site': host part of the url
     - 'path': path part of the url
     - 'cache_site': google cache url of the root page of the site
     - 'cache_path': google cache url of host plus path
     - 'coral_path': http url of the same path on the host with
       ".nyud.net:8090" appended
    """
    # items 1 and 2 of the split result are network location and path
    site, path = urlparse.urlsplit(url)[1:3]
    site_and_path = (site, path)
    context = dict(url=url, response=response, site=site, path=path)
    context['cache_site'] = get_google_cache_url("http://%s/" % site)
    context['cache_path'] = get_google_cache_url("http://%s%s" % site_and_path)
    context['coral_path'] = "http://%s.nyud.net:8090%s" % site_and_path
    return context

About Koders | Resources | Downloads | Support | Black Duck | Terms of Service | DMCA | Privacy Policy | Contact Us