Bible reference refactor

This commit is contained in:
Jon Tibble 2010-07-29 15:36:02 +01:00
parent 53625d08b6
commit fc849398a9
4 changed files with 164 additions and 261 deletions

View File

@ -23,8 +23,166 @@
# with this program; if not, write to the Free Software Foundation, Inc., 59 # # with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Temple Place, Suite 330, Boston, MA 02111-1307 USA #
############################################################################### ###############################################################################
"""
The :mod:`lib` module contains all the library functionality for the bibles
plugin.
"""
import logging
import re
log = logging.getLogger(__name__)
###############################################################################
# BIBLE_REFERENCE regular expression produces the following match groups:
#
# 0 This is a special group consisting of the whole string that matched.
# 1 [\w ]+ The book the reference is from.
# 2 [0-9]+ The first (possibly only) chapter in the reference.
# 3 None|[0-9]+ None or the only verse or the first verse in a
# verse range or the start verse in a chapter range.
# 4 None|[0-9]+|end None or the end verse of the first verse range or
# the end chapter of a chapter range.
# 5 None|[0-9]+ None or the second chapter in multiple (non-ranged)
# chapters.
# 6 None|[0-9]+|end None, the start of the second verse range or the
# end of a chapter range.
# 7 None|[0-9]+|end None or the end of the second verse range.
###############################################################################
BIBLE_REFERENCE = re.compile(
r'^([\w ]+?) *([0-9]+)' # Initial book and chapter
r'(?: *[:|v|V] *([0-9]+))?' # Verse for first chapter
r'(?: *- *([0-9]+|end$))?' # Range for verses or chapters
r'(?:(?:,([0-9]+))?' # Second chapter
r' *[,|:|v|V] *([0-9]+|end$)' # More range for verses or chapters
r'(?: *- *([0-9]+|end$))?)?$', # End of second verse range
re.UNICODE)
def check_end(match_group):
"""
Check if a regular expression match group contains the text u'end' or
should be converted to an int.
``match_group``
The match group to check.
"""
if match_group == u'end':
return -1
else:
return int(match_group)
def parse_reference(reference):
"""
This is the über-awesome function that takes a person's typed in string
and converts it to a reference list, a list of references to be queried
from the Bible database files.
The reference list is a list of tuples, with each tuple structured like
this::
(book, chapter, start_verse, end_verse)
``reference``
The bible reference to parse.
Returns None or a reference list.
"""
reference = reference.strip()
log.debug('parse_reference("%s")', reference)
unified_ref_list = []
match = BIBLE_REFERENCE.match(reference)
if match:
log.debug(u'Matched reference %s' % reference)
book = match.group(1)
chapter = int(match.group(2))
if match.group(7):
# Two verse ranges
vr1_start = int(match.group(3))
vr1_end = int(match.group(4))
unified_ref_list.append((book, chapter, vr1_start, vr1_end))
vr2_start = int(match.group(6))
vr2_end = check_end(match.group(7))
if match.group(5):
# One verse range per chapter
chapter2 = int(match.group(5))
unified_ref_list.append((book, chapter2, vr2_start, vr2_end))
else:
unified_ref_list.append((book, chapter, vr2_start, vr2_end))
elif match.group(6):
# Chapter range with verses
if match.group(3):
vr1_start = int(match.group(3))
else:
vr1_start = 1
if match.group(2) == match.group(4):
vr1_end = int(match.group(6))
unified_ref_list.append((book, chapter, vr1_start, vr1_end))
else:
vr1_end = -1
unified_ref_list.append((book, chapter, vr1_start, vr1_end))
vr2_end = check_end(match.group(6))
if int(match.group(4)) > chapter:
for x in range(chapter + 1, int(match.group(4)) + 1):
if x == int(match.group(4)):
unified_ref_list.append((book, x, 1, vr2_end))
else:
unified_ref_list.append((book, x, 1, -1))
elif match.group(4):
# Chapter range or chapter and verse range
if match.group(3):
vr1_start = int(match.group(3))
vr1_end = check_end(match.group(4))
if vr1_end == -1 or vr1_end > vr1_start:
unified_ref_list.append((book, chapter, vr1_start, vr1_end))
else:
log.debug(u'Ambiguous reference: %s' % reference)
return None
elif match.group(4) != u'end':
for x in range(chapter, int(match.group(4)) + 1):
unified_ref_list.append((book, x, 1, -1))
else:
log.debug(u'Unsupported reference: %s' % reference)
return None
elif match.group(3):
# Single chapter and verse
verse = int(match.group(3))
unified_ref_list.append((book, chapter, verse, verse))
else:
# Single chapter
unified_ref_list.append((book, chapter, -1, -1))
else:
log.debug(u'Invalid reference: %s' % reference)
return None
return unified_ref_list
class SearchResults(object):
"""
Encapsulate a set of search results. This is Bible-type independant.
"""
def __init__(self, book, chapter, verselist):
"""
Create the search result object.
``book``
The book of the Bible.
``chapter``
The chapter of the book.
``verselist``
The list of verses for this reading
"""
self.book = book
self.chapter = chapter
self.verselist = verselist
def has_verselist(self):
"""
Returns whether or not the verse list contains verses.
"""
return len(self.verselist) > 0
from common import BibleCommon
from manager import BibleManager from manager import BibleManager
from biblestab import BiblesTab from biblestab import BiblesTab
from mediaitem import BibleMediaItem from mediaitem import BibleMediaItem

View File

@ -1,256 +0,0 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=80 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2010 Raoul Snyman #
# Portions copyright (c) 2008-2010 Tim Bentley, Jonathan Corwin, Michael #
# Gorven, Scott Guerrieri, Meinert Jordan, Andreas Preikschat, Christian #
# Richter, Philip Ridout, Maikel Stuivenberg, Martin Thompson, Jon Tibble, #
# Carsten Tinggaard, Frode Woldsund #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
import urllib2
import logging
import re
import chardet
import htmlentitydefs
only_verses = re.compile(r'([\w .]+)[ ]+([0-9]+)[ ]*[:|v|V][ ]*([0-9]+)'
r'(?:[ ]*-[ ]*([0-9]+|end))?(?:[ ]*,[ ]*([0-9]+)'
r'(?:[ ]*-[ ]*([0-9]+|end))?)?',
re.UNICODE)
chapter_range = re.compile(r'([\w .]+)[ ]+([0-9]+)[ ]*[:|v|V][ ]*'
r'([0-9]+|end)[ ]*-[ ]*([0-9]+)[ ]*[:|v|V][ ]*([0-9]+|end)',
re.UNICODE)
log = logging.getLogger(__name__)
def parse_reference(reference):
"""
This is the über-awesome function that takes a person's typed in string
and converts it to a reference list, a list of references to be queried
from the Bible database files.
The reference list is a list of tuples, with each tuple structured like
this::
(book, chapter, start_verse, end_verse)
"""
reference = reference.strip()
log.debug('parse_reference("%s")', reference)
reference_list = []
# We start with the most "complicated" match first, so that they are found
# first, and we don't have any "false positives".
match = chapter_range.match(reference)
if match:
log.debug('Found a chapter range.')
book = match.group(1)
from_verse = match.group(3)
to_verse = match.group(5)
if int(match.group(2)) == int(match.group(4)):
reference_list.append(
(book, int(match.group(2)), from_verse, to_verse)
)
else:
if int(match.group(2)) > int(match.group(4)):
from_chapter = int(match.group(4))
to_chapter = int(match.group(2))
else:
from_chapter = int(match.group(2))
to_chapter = int(match.group(4))
for chapter in xrange(from_chapter, to_chapter + 1):
if chapter == from_chapter:
reference_list.append((book, chapter, from_verse, -1))
elif chapter == to_chapter:
reference_list.append((book, chapter, 1, to_verse))
else:
reference_list.append((book, chapter, 1, -1))
else:
match = only_verses.match(reference)
if match:
log.debug('Found a verse range.')
book = match.group(1)
chapter = match.group(2)
verse = match.group(3)
if match.group(4) is None:
reference_list.append((book, chapter, verse, verse))
elif match.group(5) is None:
end_verse = match.group(4)
if end_verse == u'end':
end_verse = -1
reference_list.append((book, chapter, verse, end_verse))
elif match.group(6) is None:
reference_list.extend([
(book, chapter, verse, match.group(4)),
(book, chapter, match.group(5), match.group(5))
])
else:
end_verse = match.group(6)
if end_verse == u'end':
end_verse = -1
reference_list.extend([
(book, chapter, verse, match.group(4)),
(book, chapter, match.group(5), end_verse)
])
else:
log.debug('Didn\'t find anything.')
log.debug(reference_list)
return reference_list
class SearchResults(object):
"""
Encapsulate a set of search results. This is Bible-type independant.
"""
def __init__(self, book, chapter, verselist):
"""
Create the search result object.
``book``
The book of the Bible.
``chapter``
The chapter of the book.
``verselist``
The list of verses for this reading
"""
self.book = book
self.chapter = chapter
self.verselist = verselist
def has_verselist(self):
"""
Returns whether or not the verse list contains verses.
"""
return len(self.verselist) > 0
class BibleCommon(object):
"""
A common ancestor for bible download sites.
"""
log.info(u'BibleCommon')
def _get_web_text(self, urlstring, proxyurl):
"""
Get the HTML from the web page.
``urlstring``
The URL of the page to open.
``proxyurl``
The URL of a proxy server used to access the Internet.
"""
log.debug(u'get_web_text %s %s', proxyurl, urlstring)
if proxyurl:
proxy_support = urllib2.ProxyHandler({'http': self.proxyurl})
http_support = urllib2.HTTPHandler()
opener = urllib2.build_opener(proxy_support, http_support)
urllib2.install_opener(opener)
xml_string = u''
req = urllib2.Request(urlstring)
#Make us look like an IE Browser on XP to stop blocking by web site
req.add_header(u'User-Agent',
u'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
try:
handle = urllib2.urlopen(req)
html = handle.read()
details = chardet.detect(html)
xml_string = unicode(html, details[u'encoding'])
except IOError, e:
if hasattr(e, u'reason'):
log.exception(u'Reason for failure: %s', e.reason)
return xml_string
def _clean_text(self, text):
"""
Clean up text and remove extra characters after been downloaded from
the Internet.
``text``
The text from the web page that needs to be cleaned up.
"""
#return text.rstrip()
# Remove Headings from the Text
start_tag = text.find(u'<h')
while start_tag > -1:
end_tag = text.find(u'</h', start_tag)
text = text[:(start_tag - 1)] + text[(end_tag + 4)]
start_tag = text.find(u'<h')
# Remove Support References from the Text
start_tag = text.find(u'<sup>')
while start_tag > -1:
end_tag = text.find(u'</sup>')
text = text[:start_tag] + text[end_tag + 6:len(text)]
start_tag = text.find(u'<sup>')
start_tag = text.find(u'<SUP>')
while start_tag > -1:
end_tag = text.find(u'</SUP>')
text = text[:start_tag] + text[end_tag + 6:len(text)]
start_tag = text.find(u'<SUP>')
# Static Clean ups
text = text.replace(u'\n', u'')
text = text.replace(u'\r', u'')
text = text.replace(u'&nbsp;', u'')
text = text.replace(u'<P>', u'')
text = text.replace(u'<I>', u'')
text = text.replace(u'</I>', u'')
text = text.replace(u'<P />', u'')
text = text.replace(u'<p />', u'')
text = text.replace(u'</P>', u'')
text = text.replace(u'<BR>', u'')
text = text.replace(u'<BR />', u'')
text = text.replace(u'&quot;', u'\"')
text = text.replace(u'&apos;', u'\'')
# Remove some other tags
start_tag = text.find(u'<')
while start_tag > -1:
end_tag = text.find(u'>', start_tag)
text = text[:start_tag] + text[end_tag + 1:]
start_tag = text.find(u'<')
text = text.replace(u'>', u'')
return text.rstrip().lstrip()
def unescape(text):
"""
Removes HTML or XML character references and entities from a text string.
Courtesy of Fredrik Lundh, http://effbot.org/zone/re-sub.htm#unescape-html
@param text The HTML (or XML) source text.
@return The plain text, as a Unicode string, if necessary.
"""
def fixup(markup):
text = markup.group(0)
if text.startswith(u'&#'):
# character reference
try:
if text.startswith(u'&#x'):
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
return re.sub(u'&#?\w+;', fixup, text)

View File

@ -36,7 +36,7 @@ from BeautifulSoup import BeautifulSoup, NavigableString
from openlp.core.lib import Receiver from openlp.core.lib import Receiver
from openlp.core.utils import AppLocation from openlp.core.utils import AppLocation
from openlp.plugins.bibles.lib.common import BibleCommon, SearchResults from openlp.plugins.bibles.lib import SearchResults
from openlp.plugins.bibles.lib.db import BibleDB, Book from openlp.plugins.bibles.lib.db import BibleDB, Book
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -177,7 +177,7 @@ class HTTPBooks(object):
return 0 return 0
class BGExtract(BibleCommon): class BGExtract(object):
""" """
Extract verses from BibleGateway Extract verses from BibleGateway
""" """
@ -239,7 +239,8 @@ class BGExtract(BibleCommon):
found_count += 1 found_count += 1
return SearchResults(bookname, chapter, verse_list) return SearchResults(bookname, chapter, verse_list)
class CWExtract(BibleCommon):
class CWExtract(object):
""" """
Extract verses from CrossWalk/BibleStudyTools Extract verses from CrossWalk/BibleStudyTools
""" """

View File

@ -30,9 +30,9 @@ from PyQt4 import QtCore
from openlp.core.lib import SettingsManager from openlp.core.lib import SettingsManager
from openlp.core.utils import AppLocation from openlp.core.utils import AppLocation
from openlp.plugins.bibles.lib import parse_reference
from openlp.plugins.bibles.lib.db import BibleDB, BibleMeta from openlp.plugins.bibles.lib.db import BibleDB, BibleMeta
from common import parse_reference
from opensong import OpenSongBible from opensong import OpenSongBible
from osis import OSISBible from osis import OSISBible
from csvbible import CSVBible from csvbible import CSVBible