Added basic duplicate song detection, no removal or fancy GUI yet.

This commit is contained in:
Patrick Zimmermann 2013-01-05 18:08:01 +01:00
parent 603249b2a3
commit 2dfb7bec9c
2 changed files with 145 additions and 1 deletions

View File

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2013 Raoul Snyman #
# Portions copyright (c) 2008-2013 Tim Bentley, Gerald Britton, Jonathan #
# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, #
# Meinert Jordan, Armin Köhler, Erik Lundin, Edwin Lunando, Brian T. Meyer. #
# Joshua Miller, Stevan Pettit, Andreas Preikschat, Mattias Põldaru, #
# Christian Richter, Philip Ridout, Simon Scudder, Jeffrey Smith, #
# Maikel Stuivenberg, Martin Thompson, Jon Tibble, Dave Warnock, #
# Frode Woldsund, Martin Zibricky, Patrick Zimmermann #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
"""
The :mod:`doublesfinder` module provides the functionality for detecting
probable duplicate songs in the OpenLP song database.
"""
import logging
import difflib
from openlp.core.lib import translate
from openlp.plugins.songs.lib.db import Song
from openlp.plugins.songs.lib.ui import SongStrings
log = logging.getLogger(__name__)
class DuplicateSongFinder(object):
    """
    The :class:`DuplicateSongFinder` class decides whether two songs are
    probably duplicates of each other by diffing their ``search_lyrics``.

    The comparison is tuned through three thresholds set in the constructor:

    ``minFragmentSize``
        Smallest diff block that may border a typo for the typo to be ignored.

    ``minBlockSize``
        Smallest equal block that counts towards the total amount of
        matching text; also the total needed to call two songs equal.

    ``maxTypoSize``
        Largest non-equal diff block that is still treated as a typo.
    """

    def __init__(self):
        self.minFragmentSize = 5
        self.minBlockSize = 70
        self.maxTypoSize = 3

    def songsProbablyEqual(self, song1, song2):
        """
        Report whether two songs look like duplicates.

        ``song1``, ``song2``
            Objects exposing a ``search_lyrics`` sequence (passed to
            ``len()`` and to :class:`difflib.SequenceMatcher`).

        Returns ``True`` when, after typos are ignored, either the large
        equal blocks add up to at least ``minBlockSize`` or the longest equal
        block is longer than two thirds of the shorter lyrics.
        """
        lyrics1 = song1.search_lyrics
        lyrics2 = song2.search_lyrics
        # On equal length the second song's lyrics are used as the "small"
        # side; the matcher is not symmetric, so keep this ordering.
        if len(lyrics1) < len(lyrics2):
            small, large = lyrics1, lyrics2
        else:
            small, large = lyrics2, lyrics1
        opcodes = difflib.SequenceMatcher(a=small, b=large).get_opcodes()
        cleaned = self.__removeTypos(opcodes)
        if self.__lengthOfEqualBlocks(cleaned) >= self.minBlockSize:
            return True
        return self.__lengthOfLongestEqualBlock(cleaned) > len(small)*2/3

    def __opLength(self, opcode):
        """
        Return the larger of the two spans covered by a diff opcode
        ``(tag, i1, i2, j1, j2)``.
        """
        span_a = opcode[2] - opcode[1]
        span_b = opcode[4] - opcode[3]
        return max(span_a, span_b)

    def __removeTypos(self, diff):
        """
        Drop small non-equal opcodes next to sufficiently large blocks and
        merge the equal blocks that become adjacent.

        ``diff``
            List of opcodes; it is modified in place and also returned.
        """
        def is_fragment(op):
            return self.__opLength(op) >= self.minFragmentSize

        def is_typo(op):
            return op[0] != "equal" and self.__opLength(op) <= self.maxTypoSize

        def is_equal_fragment(op):
            return op[0] == "equal" and is_fragment(op)

        # Typo in front of the first fragment.
        if len(diff) >= 2 and is_typo(diff[0]) and is_fragment(diff[1]):
            diff.pop(0)
        # Typos enclosed by two fragments; walk backwards so that deleting an
        # entry never shifts the triples still to be inspected.
        for left in reversed(range(len(diff) - 2)):
            middle = left + 1
            if is_fragment(diff[left]) and is_typo(diff[middle]) and \
                    is_fragment(diff[middle + 1]):
                del diff[middle]
        # Typo behind the last fragment.
        if len(diff) >= 2 and is_fragment(diff[-2]) and is_typo(diff[-1]):
            diff.pop()
        # Fuse neighbouring equal fragments into a single equal block.
        for left in reversed(range(len(diff) - 1)):
            first = diff[left]
            second = diff[left + 1]
            if is_equal_fragment(first) and is_equal_fragment(second):
                merged = ("equal", first[1], second[2], first[3], second[4])
                diff[left:left + 2] = [merged]
        return diff

    def __lengthOfEqualBlocks(self, diff):
        """
        Return the summed length of all equal blocks that are at least
        ``minBlockSize`` long.
        """
        return sum(self.__opLength(op) for op in diff
            if op[0] == "equal" and self.__opLength(op) >= self.minBlockSize)

    def __lengthOfLongestEqualBlock(self, diff):
        """
        Return the length of the longest equal block, or 0 if there is none.
        """
        return max([0] + [self.__opLength(op) for op in diff if op[0] == "equal"])

View File

@ -45,6 +45,7 @@ from openlp.plugins.songs.lib import clean_song, upgrade, SongMediaItem, \
from openlp.plugins.songs.lib.db import init_schema, Song from openlp.plugins.songs.lib.db import init_schema, Song
from openlp.plugins.songs.lib.importer import SongFormat from openlp.plugins.songs.lib.importer import SongFormat
from openlp.plugins.songs.lib.olpimport import OpenLPSongImport from openlp.plugins.songs.lib.olpimport import OpenLPSongImport
from openlp.plugins.songs.lib.doublesfinder import DuplicateSongFinder
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -77,10 +78,12 @@ class SongsPlugin(Plugin):
self.songImportItem.setVisible(True) self.songImportItem.setVisible(True)
self.songExportItem.setVisible(True) self.songExportItem.setVisible(True)
self.toolsReindexItem.setVisible(True) self.toolsReindexItem.setVisible(True)
self.toolsFindDuplicates.setVisible(True)
action_list = ActionList.get_instance() action_list = ActionList.get_instance()
action_list.add_action(self.songImportItem, UiStrings().Import) action_list.add_action(self.songImportItem, UiStrings().Import)
action_list.add_action(self.songExportItem, UiStrings().Export) action_list.add_action(self.songExportItem, UiStrings().Export)
action_list.add_action(self.toolsReindexItem, UiStrings().Tools) action_list.add_action(self.toolsReindexItem, UiStrings().Tools)
action_list.add_action(self.toolsFindDuplicates, UiStrings().Tools)
QtCore.QObject.connect(Receiver.get_receiver(), QtCore.QObject.connect(Receiver.get_receiver(),
QtCore.SIGNAL(u'servicemanager_new_service'), QtCore.SIGNAL(u'servicemanager_new_service'),
self.clearTemporarySongs) self.clearTemporarySongs)
@ -122,7 +125,7 @@ class SongsPlugin(Plugin):
def addToolsMenuItem(self, tools_menu): def addToolsMenuItem(self, tools_menu):
""" """
Give the alerts plugin the opportunity to add items to the Give the Songs plugin the opportunity to add items to the
**Tools** menu. **Tools** menu.
``tools_menu`` ``tools_menu``
@ -137,6 +140,12 @@ class SongsPlugin(Plugin):
'Re-index the songs database to improve searching and ordering.'), 'Re-index the songs database to improve searching and ordering.'),
visible=False, triggers=self.onToolsReindexItemTriggered) visible=False, triggers=self.onToolsReindexItemTriggered)
tools_menu.addAction(self.toolsReindexItem) tools_menu.addAction(self.toolsReindexItem)
self.toolsFindDuplicates = create_action(tools_menu, u'toolsFindDuplicates',
text=translate('SongsPlugin', 'Find &duplicate songs'),
statustip=translate('SongsPlugin',
'Find and remove duplicate songs in the song database.'),
visible=False, triggers=self.onToolsFindDuplicatesTriggered)
tools_menu.addAction(self.toolsFindDuplicates)
def onToolsReindexItemTriggered(self): def onToolsReindexItemTriggered(self):
""" """
@ -157,6 +166,25 @@ class SongsPlugin(Plugin):
self.manager.save_objects(songs) self.manager.save_objects(songs)
self.mediaItem.onSearchTextButtonClicked() self.mediaItem.onSearchTextButtonClicked()
def onToolsFindDuplicatesTriggered(self):
    """
    Search for duplicates in the song database.

    Every unordered pair of songs is compared once. For each pair that
    ``DuplicateSongFinder.songsProbablyEqual`` reports as equal, a message
    box shows the two list positions (inner index first, then outer index).
    Nothing is removed from the database.
    """
    # Nothing to compare in an empty database.
    if self.manager.get_object_count(Song) == 0:
        return
    QtGui.QMessageBox.information(self.formParent,
        "Find duplicates called", "Called...")
    songs = self.manager.get_all_objects(Song)
    # Bound the loops by the list that was actually fetched rather than by
    # the separately queried count, so the indices can never run past it.
    songCount = len(songs)
    # A single finder is reused for all comparisons instead of building a
    # new one for every pair.
    doubleFinder = DuplicateSongFinder()
    for outerSongCounter in range(songCount - 1):
        outerSong = songs[outerSongCounter]
        for innerSongCounter in range(outerSongCounter + 1, songCount):
            if doubleFinder.songsProbablyEqual(outerSong,
                    songs[innerSongCounter]):
                QtGui.QMessageBox.information(self.formParent,
                    "Double found", str(innerSongCounter) + " " +
                    str(outerSongCounter))
def onSongImportItemClicked(self): def onSongImportItemClicked(self):
if self.mediaItem: if self.mediaItem:
self.mediaItem.onImportClick() self.mediaItem.onImportClick()
@ -280,10 +308,12 @@ class SongsPlugin(Plugin):
self.songImportItem.setVisible(False) self.songImportItem.setVisible(False)
self.songExportItem.setVisible(False) self.songExportItem.setVisible(False)
self.toolsReindexItem.setVisible(False) self.toolsReindexItem.setVisible(False)
self.toolsFindDuplicates.setVisible(False)
action_list = ActionList.get_instance() action_list = ActionList.get_instance()
action_list.remove_action(self.songImportItem, UiStrings().Import) action_list.remove_action(self.songImportItem, UiStrings().Import)
action_list.remove_action(self.songExportItem, UiStrings().Export) action_list.remove_action(self.songExportItem, UiStrings().Export)
action_list.remove_action(self.toolsReindexItem, UiStrings().Tools) action_list.remove_action(self.toolsReindexItem, UiStrings().Tools)
action_list.remove_action(self.toolsFindDuplicates, UiStrings().Tools)
Plugin.finalise(self) Plugin.finalise(self)
def clearTemporarySongs(self): def clearTemporarySongs(self):