# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=80 tabstop=4 softtabstop=4

###############################################################################
# OpenLP - Open Source Lyrics Projection                                      #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2010 Raoul Snyman                                        #
# Portions copyright (c) 2008-2010 Tim Bentley, Jonathan Corwin, Michael      #
# Gorven, Scott Guerrieri, Christian Richter, Maikel Stuivenberg, Martin      #
# Thompson, Jon Tibble, Carsten Tinggaard                                     #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it     #
# under the terms of the GNU General Public License as published by the Free  #
# Software Foundation; version 2 of the License.                              #
#                                                                             #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for    #
# more details.                                                               #
#                                                                             #
# You should have received a copy of the GNU General Public License along     #
# with this program; if not, write to the Free Software Foundation, Inc., 59  #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA                          #
###############################################################################
"""
Helpers shared by the Bible importers: parsing of typed scripture references,
a small container for search results, and utilities for downloading and
cleaning up verse text from web pages.
"""

import logging
import re

try:
    import urllib2
except ImportError:
    # Python 3 exposes the same handler/opener API under urllib.request.
    import urllib.request as urllib2

try:
    import htmlentitydefs
except ImportError:
    # Python 3 name of the same entity tables.
    import html.entities as htmlentitydefs

try:
    import chardet
except ImportError:
    # Encoding detection is optional; _get_web_text then falls back to UTF-8.
    chardet = None

try:
    _unichr = unichr
    _text_type = unicode
except NameError:
    # Python 3: str is already Unicode.
    _unichr = chr
    _text_type = str

only_verses = re.compile(r'([\w .]+)[ ]+([0-9]+)[ ]*[:|v|V][ ]*([0-9]+)'
    r'(?:[ ]*-[ ]*([0-9]+|end))?(?:[ ]*,[ ]*([0-9]+)(?:[ ]*-[ ]*([0-9]+|end))?)?',
    re.UNICODE)
chapter_range = re.compile(r'([\w .]+)[ ]+([0-9]+)[ ]*[:|v|V][ ]*'
    r'([0-9]+|end)[ ]*-[ ]*([0-9]+)[ ]*[:|v|V][ ]*([0-9]+|end)',
    re.UNICODE)

# NOTE(review): the tag and entity literals used by BibleCommon._clean_text
# were reconstructed; please confirm them against the upstream file.
_heading_element = re.compile(u'<h[1-6][^>]*>.*?</h[1-6]>',
    re.IGNORECASE | re.DOTALL)
_sup_element = re.compile(u'<sup>.*?</sup>', re.IGNORECASE | re.DOTALL)
_any_tag = re.compile(u'<[^>]*>')
_entity = re.compile(u'&#?\\w+;')

log = logging.getLogger(__name__)


def _parse_verse(token):
    """
    Convert a verse token captured by the reference patterns to an integer.

    ``token``
        A string of digits, or the literal ``end``.

    Returns the verse number, or ``-1`` when the token is ``end``.
    """
    if token == u'end':
        return -1
    return int(token)


def parse_reference(reference):
    """
    This is the über-awesome function that takes a person's typed in string
    and converts it to a reference list, a list of references to be queried
    from the Bible database files.

    The reference list is a list of tuples, with each tuple structured like
    this::

        (book, chapter, start_verse, end_verse)

    ``book`` is the text typed before the chapter number; ``chapter``,
    ``start_verse`` and ``end_verse`` are integers. An ``end_verse`` of ``-1``
    stands for the keyword ``end``, and is also used for every chapter of a
    multi-chapter range except the last one.

    ``reference``
        The reference as typed, e.g. ``John 3:16``, ``John 3:16-18``,
        ``John 3:16-end``, ``John 3:16,18-20`` or ``John 3:16-4:2``.

    Returns an empty list when the text matches neither pattern.
    """
    reference = reference.strip()
    log.debug('parse_reference("%s")', reference)
    reference_list = []
    # We start with the most "complicated" match first, so that they are found
    # first, and we don't have any "false positives".
    match = chapter_range.match(reference)
    if match:
        log.debug('Found a chapter range.')
        book = match.group(1)
        start = (int(match.group(2)), _parse_verse(match.group(3)))
        end = (int(match.group(4)), _parse_verse(match.group(5)))
        if start[0] > end[0]:
            # Range typed backwards: swap the whole end points so that each
            # verse stays attached to the chapter it was typed with.
            start, end = end, start
        from_chapter, from_verse = start
        to_chapter, to_verse = end
        if from_chapter == to_chapter:
            reference_list.append((book, from_chapter, from_verse, to_verse))
        else:
            # First chapter runs from the given verse to its end, the chapters
            # in between are taken whole, the last one stops at to_verse.
            reference_list.append((book, from_chapter, from_verse, -1))
            for chapter in range(from_chapter + 1, to_chapter):
                reference_list.append((book, chapter, 1, -1))
            reference_list.append((book, to_chapter, 1, to_verse))
    else:
        match = only_verses.match(reference)
        if match:
            log.debug('Found a verse range.')
            book = match.group(1)
            chapter = int(match.group(2))
            first_start = int(match.group(3))
            first_end = first_start
            if match.group(4) is not None:
                first_end = _parse_verse(match.group(4))
            reference_list.append((book, chapter, first_start, first_end))
            # The part after the comma is independent of whether the first
            # part was a single verse or a range.
            if match.group(5) is not None:
                second_start = int(match.group(5))
                second_end = second_start
                if match.group(6) is not None:
                    second_end = _parse_verse(match.group(6))
                reference_list.append(
                    (book, chapter, second_start, second_end))
        else:
            log.debug('Didn\'t find anything.')
    log.debug(reference_list)
    return reference_list


class SearchResults(object):
    """
    Encapsulate a set of search results. This is Bible-type independent.
    """
    def __init__(self, book, chapter, verselist):
        """
        Create the search result object.

        ``book``
            The book of the Bible.

        ``chapter``
            The chapter of the book.

        ``verselist``
            The list of verses for this reading
        """
        self.book = book
        self.chapter = chapter
        self.verselist = verselist

    def get_verselist(self):
        """
        Returns the list of verses.
        """
        return self.verselist

    def get_book(self):
        """
        Returns the book of the Bible.
        """
        return self.book

    def get_chapter(self):
        """
        Returns the chapter of the book.
        """
        return self.chapter

    def has_verselist(self):
        """
        Returns whether or not the verse list contains verses.
        """
        return len(self.verselist) > 0


class BibleCommon(object):
    """
    A common ancestor for bible download sites.
    """
    log.info(u'BibleCommon')

    def _get_web_text(self, urlstring, proxyurl):
        """
        Get the HTML from the web page.

        ``urlstring``
            The URL of the page to open.

        ``proxyurl``
            The URL of a proxy server used to access the Internet.

        Returns the page as a Unicode string, or an empty string when the
        download fails with an ``IOError``.
        """
        log.debug(u'get_web_text %s %s', proxyurl, urlstring)
        if proxyurl:
            # Use the argument: this class defines no ``proxyurl`` attribute.
            proxy_support = urllib2.ProxyHandler({'http': proxyurl})
            http_support = urllib2.HTTPHandler()
            opener = urllib2.build_opener(proxy_support, http_support)
            # Installed globally, so later urlopen() calls use the proxy too.
            urllib2.install_opener(opener)
        req = urllib2.Request(urlstring)
        #Make us look like an IE Browser on XP to stop blocking by web site
        req.add_header(u'User-Agent',
            u'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
        try:
            handle = urllib2.urlopen(req)
            try:
                raw_page = handle.read()
            finally:
                handle.close()
        except IOError as e:
            if hasattr(e, u'reason'):
                log.exception(u'Reason for failure: %s', e.reason)
            else:
                log.exception(u'Failed to download %s', urlstring)
            return u''
        encoding = None
        if chardet is not None:
            encoding = chardet.detect(raw_page)[u'encoding']
        if encoding:
            return _text_type(raw_page, encoding)
        # No detector available, or it could not guess: decode leniently
        # rather than passing None as the codec name.
        return _text_type(raw_page, u'utf-8', u'replace')

    def _clean_text(self, text):
        """
        Clean up text and remove extra characters after been downloaded from
        the Internet.

        ``text``
            The text from the web page that needs to be cleaned up.

        Returns the text with headings, superscripts, line breaks, markup and
        surrounding whitespace removed.
        """
        # Remove Headings from the Text
        text = _heading_element.sub(u'', text)
        # Remove superscript elements together with their content
        text = _sup_element.sub(u'', text)
        # Static Clean ups
        text = text.replace(u'\n', u'')
        text = text.replace(u'\r', u'')
        text = text.replace(u'&nbsp;', u'')
        text = text.replace(u'&quot;', u'"')
        text = text.replace(u'&apos;', u'\'')
        # Remove every remaining tag. Only complete ``<...>`` pairs are
        # touched, so a stray ``<`` can no longer make the text grow forever.
        text = _any_tag.sub(u'', text)
        text = text.replace(u'>', u'')
        return text.strip()


def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.
    Courtesy of Fredrik Lundh, http://effbot.org/zone/re-sub.htm#unescape-html

    ``text``
        The HTML (or XML) source text.

    Returns the plain text as a Unicode string. References that cannot be
    resolved are left as they are.
    """
    def fixup(m):
        entity = m.group(0)
        if entity[:2] == u'&#':
            # character reference
            try:
                if entity[:3] == u'&#x':
                    return _unichr(int(entity[3:-1], 16))
                return _unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                pass
        else:
            # named entity
            try:
                return _unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity # leave as is
    return _entity.sub(fixup, text)