Fixes getting bible texts from CrossWalk

Fix to handling of form feed char and vertical tab char.

bzr-revno: 2653
This commit is contained in:
second@tgc.dk 2016-09-24 12:15:07 +02:00 committed by Raoul Snyman
commit 92ccb1681c
3 changed files with 7 additions and 4 deletions

View File

@ -480,7 +480,7 @@ class CWExtract(RegistryProperties):
for verse in verses_div:
self.application.process_events()
verse_number = int(verse.find('strong').contents[0])
verse_span = verse.find('span')
verse_span = verse.find('span', class_='verse-%d' % verse_number)
tags_to_remove = verse_span.find_all(['a', 'sup'])
for tag in tags_to_remove:
tag.decompose()

View File

@ -93,7 +93,7 @@ class MediaShoutImport(SongImport):
self.song_book_name = song.SongID
for verse in verses:
tag = VERSE_TAGS[verse.Type] + str(verse.Number) if verse.Type < len(VERSE_TAGS) else 'O'
self.add_verse(verse.Text, tag)
self.add_verse(self.tidy_text(verse.Text), tag)
for order in verse_order:
if order.Type < len(VERSE_TAGS):
self.verse_order_list.append(VERSE_TAGS[order.Type] + str(order.Number))

View File

@ -140,10 +140,13 @@ class SongImport(QtCore.QObject):
text = text.replace('\u2026', '...')
text = text.replace('\u2013', '-')
text = text.replace('\u2014', '-')
# Replace vertical tab with 2 linebreaks
text = text.replace('\v', '\n\n')
# Replace form feed (page break) with 2 linebreaks
text = text.replace('\f', '\n\n')
# Remove surplus blank lines, spaces, trailing/leading spaces
text = re.sub(r'[ \t\v]+', ' ', text)
text = re.sub(r'[ \t]+', ' ', text)
text = re.sub(r' ?(\r\n?|\n) ?', '\n', text)
text = re.sub(r' ?(\n{5}|\f)+ ?', '\f', text)
return text
def process_song_text(self, text):