forked from openlp/openlp
Fix BG Chinese bible (Bug #706211)
Fix CSV import to accept the same files that V1 did, and to handle the "not quite" ASCII text found in cp1252-encoded files. bzr-revno: 1255 Fixes: https://launchpad.net/bugs/706211
This commit is contained in:
commit
90b1205e5b
@ -50,14 +50,17 @@ The format of the books file is:
|
|||||||
...
|
...
|
||||||
40,2,Matthew,Matt
|
40,2,Matthew,Matt
|
||||||
|
|
||||||
The format of the verses file is:
|
There are two acceptable formats of the verses file. They are:
|
||||||
|
|
||||||
<book_id>,<chapter_number>,<verse_number>,<verse_text>
|
<book_id>,<chapter_number>,<verse_number>,<verse_text>
|
||||||
|
or
|
||||||
|
<book_name>,<chapter_number>,<verse_number>,<verse_text>
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
1,1,1,"In the beginning God created the heaven and the earth."
|
1,1,1,"In the beginning God created the heaven and the earth."
|
||||||
1,1,2,"And the earth was without form, and void; and darkness...."
|
or
|
||||||
|
"Genesis",1,2,"And the earth was without form, and void; and...."
|
||||||
|
|
||||||
All CSV files are expected to use a comma (',') as the delimiter and double
|
All CSV files are expected to use a comma (',') as the delimiter and double
|
||||||
quotes ('"') as the quote symbol.
|
quotes ('"') as the quote symbol.
|
||||||
@ -172,15 +175,22 @@ class CSVBible(BibleDB):
|
|||||||
for line in verse_reader:
|
for line in verse_reader:
|
||||||
if self.stop_import_flag:
|
if self.stop_import_flag:
|
||||||
break
|
break
|
||||||
if book_ptr != book_list[int(line[0])]:
|
try:
|
||||||
book = self.get_book(book_list[int(line[0])])
|
line_book = book_list[int(line[0])]
|
||||||
|
except ValueError:
|
||||||
|
line_book = unicode(line[0], details['encoding'])
|
||||||
|
if book_ptr != line_book:
|
||||||
|
book = self.get_book(line_book)
|
||||||
book_ptr = book.name
|
book_ptr = book.name
|
||||||
self.wizard.incrementProgressBar(unicode(translate(
|
self.wizard.incrementProgressBar(unicode(translate(
|
||||||
'BibleDB.Wizard', 'Importing verses from %s...',
|
'BibleDB.Wizard', 'Importing verses from %s...',
|
||||||
'Importing verses from <book name>...')) % book.name)
|
'Importing verses from <book name>...')) % book.name)
|
||||||
self.session.commit()
|
self.session.commit()
|
||||||
self.create_verse(book.id, line[1], line[2],
|
try:
|
||||||
unicode(line[3], details['encoding']))
|
verse_text = unicode(line[3], details['encoding'])
|
||||||
|
except UnicodeError:
|
||||||
|
verse_text = unicode(line[3], u'cp1252')
|
||||||
|
self.create_verse(book.id, line[1], line[2], verse_text)
|
||||||
self.wizard.incrementProgressBar(translate('BibleDB.Wizard',
|
self.wizard.incrementProgressBar(translate('BibleDB.Wizard',
|
||||||
'Importing verses... done.'))
|
'Importing verses... done.'))
|
||||||
Receiver.send_message(u'openlp_process_events')
|
Receiver.send_message(u'openlp_process_events')
|
||||||
|
@ -210,6 +210,7 @@ class BGExtract(object):
|
|||||||
cleaner = [(re.compile(' |<br />|\'\+\''), lambda match: '')]
|
cleaner = [(re.compile(' |<br />|\'\+\''), lambda match: '')]
|
||||||
soup = get_soup_for_bible_ref(
|
soup = get_soup_for_bible_ref(
|
||||||
u'http://www.biblegateway.com/passage/?%s' % url_params,
|
u'http://www.biblegateway.com/passage/?%s' % url_params,
|
||||||
|
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='',
|
||||||
cleaner=cleaner)
|
cleaner=cleaner)
|
||||||
if not soup:
|
if not soup:
|
||||||
return None
|
return None
|
||||||
@ -499,7 +500,8 @@ class HTTPBible(BibleDB):
|
|||||||
"""
|
"""
|
||||||
return HTTPBooks.get_verse_count(book, chapter)
|
return HTTPBooks.get_verse_count(book, chapter)
|
||||||
|
|
||||||
def get_soup_for_bible_ref(reference_url, header=None, cleaner=None):
|
def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
|
||||||
|
pre_parse_substitute=None, cleaner=None):
|
||||||
"""
|
"""
|
||||||
Gets a webpage and returns a parsed and optionally cleaned soup or None.
|
Gets a webpage and returns a parsed and optionally cleaned soup or None.
|
||||||
|
|
||||||
@ -509,6 +511,13 @@ def get_soup_for_bible_ref(reference_url, header=None, cleaner=None):
|
|||||||
``header``
|
``header``
|
||||||
An optional HTTP header to pass to the bible web server.
|
An optional HTTP header to pass to the bible web server.
|
||||||
|
|
||||||
|
``pre_parse_regex``
|
||||||
|
A regular expression to run on the webpage. Allows manipulation of the
|
||||||
|
webpage before passing to BeautifulSoup for parsing.
|
||||||
|
|
||||||
|
``pre_parse_substitute``
|
||||||
|
The text to replace any matches to the regular expression with.
|
||||||
|
|
||||||
``cleaner``
|
``cleaner``
|
||||||
An optional regex to use during webpage parsing.
|
An optional regex to use during webpage parsing.
|
||||||
"""
|
"""
|
||||||
@ -518,12 +527,15 @@ def get_soup_for_bible_ref(reference_url, header=None, cleaner=None):
|
|||||||
if not page:
|
if not page:
|
||||||
send_error_message(u'download')
|
send_error_message(u'download')
|
||||||
return None
|
return None
|
||||||
|
page_source = page.read()
|
||||||
|
if pre_parse_regex and pre_parse_substitute is not None:
|
||||||
|
page_source = re.sub(pre_parse_regex, pre_parse_substitute, page_source)
|
||||||
soup = None
|
soup = None
|
||||||
try:
|
try:
|
||||||
if cleaner:
|
if cleaner:
|
||||||
soup = BeautifulSoup(page, markupMassage=cleaner)
|
soup = BeautifulSoup(page_source, markupMassage=cleaner)
|
||||||
else:
|
else:
|
||||||
soup = BeautifulSoup(page)
|
soup = BeautifulSoup(page_source)
|
||||||
except HTMLParseError:
|
except HTMLParseError:
|
||||||
log.exception(u'BeautifulSoup could not parse the bible page.')
|
log.exception(u'BeautifulSoup could not parse the bible page.')
|
||||||
if not soup:
|
if not soup:
|
||||||
|
Loading…
Reference in New Issue
Block a user