#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Soundex algorithm - Version 2.0

Based on Soundex Algorithm of Mark Pilgrim (mark@diveintopython.org)

This program is part of "Dive Into Python", a free Python book for
experienced programmers.  Visit http://diveintopython.org/ for the
latest version.

__author__ = "Mark Pilgrim (mark@diveintopython.org)"
__version__ = "$Revision: 1.5 $"
__date__ = "$Date: 2004/05/11 19:11:21 $"
__copyright__ = "Copyright (c) 2004 Mark Pilgrim"
__license__ = "Python"
"""

__author__ = "Florent Carlier (florent.carlier@univ-lemans.fr)"
__version__ = "$Revision: 2.0 $"
__date__ = "$Date: 2005/05/05 $"

import string
import re

# Code returned by every encoder when the input is empty or not alphabetic.
_INVALID_CODE = "0000"

# Letter -> digit tables, indexed A..Z.  Dict lookups are used instead of
# string.maketrans()/string.uppercase, which are Python 2 only (and
# string.uppercase is locale dependent).
# In the classic table "9" marks letters that carry no sound code.
_SOUNDEX_CODES = dict(zip(string.ascii_uppercase,
                          "91239129922455912623919292"))
# In the French table "0" plays the role of the "no code" marker.
_SOUNDEX_FR_CODES = dict(zip(string.ascii_uppercase,
                             "01230970072455012683090908"))

# Accented letters folded to their unaccented uppercase base letter.
# Both cases are listed because the folding runs before upper(), and
# whole-substring replace() is used so that this also works on UTF-8
# encoded byte strings.
_ACCENT_FOLDS = (
    ("E", ("é", "è", "ë", "ê", "É", "È", "Ë", "Ê")),
    ("A", ("à", "ä", "â", "À", "Ä", "Â")),
    ("C", ("ç", "Ç")),
    ("I", ("î", "ï", "Î", "Ï")),
    ("O", ("ô", "ö", "Ô", "Ö")),
    ("U", ("ù", "ü", "û", "Ù", "Ü", "Û")),
)

# Result labels of diff_hamming(), indexed by the number of differing
# positions (capped at the last entry).
_HAMMING_LABELS = (
    "0 : Identique",
    "1 : Difference N1",
    "2 : Difference N2",
    "3 : Difference N3",
    "4 : Dissemblable",
)

# Soundex2: letter groups replaced anywhere in the word, applied in order.
_SOUNDEX2_GROUPS = (
    ("GUI", "KI"), ("GUE", "KE"), ("GA", "KA"), ("GO", "KO"), ("GU", "K"),
    ("CA", "KA"), ("CO", "KO"), ("CU", "KU"), ("Q", "K"), ("CC", "K"),
    ("CK", "K"),
)

# Soundex2: replacements applied only at the start of the word, in order.
_SOUNDEX2_PREFIXES = (
    ("MAC", "MCC"), ("SCH", "SSS"), ("ASA", "AZA"),
    ("KN", "NN"), ("PH", "FF"), ("PF", "FF"),
)


def _strip_separators(source):
    """Return *source* without "-" and " " characters."""
    return source.replace('-', '').replace(' ', '')


def _squeeze(text):
    """Collapse each run of identical consecutive characters into one."""
    kept = []
    for char in text:
        if not kept or kept[-1] != char:
            kept.append(char)
    return "".join(kept)


def _encode(source, codes, marker):
    """Encode an uppercase alphabetic word with a letter -> digit table.

    The first character is kept as is, the remaining ones are translated
    through *codes* (characters missing from the table pass through
    unchanged), consecutive duplicates are collapsed, every *marker*
    digit is dropped and the result is padded with "0" / cut to 4
    characters.
    """
    digits = source[0] + "".join([codes.get(char, char)
                                  for char in source[1:]])
    return (_squeeze(digits).replace(marker, '') + '000')[:4]


def soundex(source):
    """Convert a string to its Soundex equivalent.

    Hyphens and spaces are ignored and the input is uppercased first.
    Returns a 4 character code, or "0000" when the input is empty or
    contains anything other than letters.
    """
    if not source:
        return _INVALID_CODE

    source = _strip_separators(source).upper()
    if not source.isalpha():
        return _INVALID_CODE

    return _encode(source, _SOUNDEX_CODES, '9')


def soundex_fr(source):
    """Convert a string to its Soundex equivalent using the French table.

    Hyphens and spaces are ignored, accented letters (either case) are
    folded to their base letter and the input is uppercased.
    Returns a 4 character code, or "0000" when the input is empty or
    contains anything other than letters.
    """
    if not source:
        return _INVALID_CODE

    source = _strip_separators(source)
    # Replace accented characters before uppercasing.
    for plain, accented_forms in _ACCENT_FOLDS:
        for accented in accented_forms:
            source = source.replace(accented, plain)
    source = source.upper()
    if not source.isalpha():
        return _INVALID_CODE

    return _encode(source, _SOUNDEX_FR_CODES, '0')


def diff_hamming(soundex1, soundex2):
    """Return a label for the Hamming difference between two codes.

    The two codes are compared position by position; the shorter one is
    padded with spaces so that a missing character counts as a
    difference against anything but a padding space.  The number of
    differing positions (capped at 4) selects the label:
    "0 : Identique", "1 : Difference N1", "2 : Difference N2",
    "3 : Difference N3" or "4 : Dissemblable".

    NOTE(review): the original body returned "0 : Identique"
    unconditionally; the mapping "n differing positions -> label n" is
    inferred from the labels themselves - confirm against callers.
    """
    width = max(len(soundex1), len(soundex2))
    pairs = zip(soundex1.ljust(width), soundex2.ljust(width))
    distance = sum(1 for left, right in pairs if left != right)
    return _HAMMING_LABELS[min(distance, len(_HAMMING_LABELS) - 1)]


def soundex2(source):
    """Convert a string to its Soundex2 (letters only) equivalent.

    Hyphens and spaces are ignored and the input is uppercased first.
    Returns a 4 character code padded with spaces, or "0000" when the
    input is empty or contains anything other than letters.
    """
    if not source:
        return _INVALID_CODE

    source = _strip_separators(source).upper()
    if not source.isalpha():
        return _INVALID_CODE

    # Step: replace letter groups by their correspondence.
    for group, replacement in _SOUNDEX2_GROUPS:
        source = source.replace(group, replacement)

    # Step: replace every vowel except the leading character by "A".
    source = source[0] + re.sub(r'[EIOU]', 'A', source[1:])

    # Step: replace prefixes (each rule is tried in turn on the result
    # of the previous one).
    for prefix, replacement in _SOUNDEX2_PREFIXES:
        if source.startswith(prefix):
            source = replacement + source[len(prefix):]

    # Step: drop "H" unless it follows "C" or "S".
    source = re.sub(r'([^CS])H', r'\1', source)

    # Step: drop "Y" unless it follows "A".
    source = re.sub(r'([^A])Y', r'\1', source)

    # Step: drop a final A, D, T or S.  The leading letter is never
    # dropped, otherwise one-letter inputs such as "A" would leave an
    # empty string and crash the steps below.
    if len(source) > 1 and source[-1] in "ADTS":
        source = source[:-1]

    # Step: drop every "A" except a leading one.
    source = source[0] + source[1:].replace('A', '')

    # Remove consecutive duplicates, then cut / pad with spaces to a
    # fixed width of 4 characters.
    return _squeeze(source)[:4].ljust(4)