001/* ***** BEGIN LICENSE BLOCK ***** 002 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 003 * 004 * The contents of this file are subject to the Mozilla Public License Version 005 * 1.1 (the "License"); you may not use this file except in compliance with 006 * the License. You may obtain a copy of the License at 007 * http://www.mozilla.org/MPL/ 008 * 009 * Software distributed under the License is distributed on an "AS IS" basis, 010 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 011 * for the specific language governing rights and limitations under the 012 * License. 013 * 014 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in 015 * Java(TM), hosted at https://github.com/gunterze/dcm4che. 016 * 017 * The Initial Developer of the Original Code is 018 * Agfa Healthcare. 019 * Portions created by the Initial Developer are Copyright (C) 2011 020 * the Initial Developer. All Rights Reserved. 021 * 022 * Contributor(s): 023 * See listed authors below. 024 * 025 * Alternatively, the contents of this file may be used under the terms of 026 * either the GNU General Public License Version 2 or later (the "GPL"), or 027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 028 * in which case the provisions of the GPL or the LGPL are applicable instead 029 * of those above. If you wish to allow use of your version of this file only 030 * under the terms of either the GPL or the LGPL, and not to allow others to 031 * use your version of this file under the terms of the MPL, indicate your 032 * decision by deleting the provisions above and replace them with the notice 033 * and other provisions required by the GPL or the LGPL. If you do not delete 034 * the provisions above, a recipient may use your version of this file under 035 * the terms of any one of the MPL, the GPL or the LGPL. 036 * 037 * ***** END LICENSE BLOCK ***** */ 038 039package org.dcm4che3.soundex; 040 041/** 042 * Implementation of the PHONEM substitutions as described in Georg Wilde and 043 * Carsten Meyer, Doppelgaenger gesucht - Ein Programm fuer kontextsensitive 044 * phonetische Textumwandlung ct Magazin fuer Computer & Technik 25/1998. 045 * 046 * @see <a href="http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf" 047 * >Martin Wilz: Aspekte der Kodierung phonetischer Ähnlichkeiten in 048 * deutschen Eigennamen</a> 049 * 050 * @author Gunter Zeilinger <gunterze@gmail.com> 051 */ 052public class Phonem implements FuzzyStr { 053 054 @Override 055 public String toFuzzy(String s) { 056 if (s == null || s.length() == 0) 057 return ""; 058 059 char[] in = s.toUpperCase().toCharArray(); 060 char next = in[0]; 061 int j = 0; 062 for (int i = 1; i < in.length; i++) { 063 char prev = next; 064 switch ((prev << 8) + (next = in[i])) { 065 case 0x5343: // SC 066 case 0x535a: // SZ 067 case 0x435a: // CZ 068 case 0x5453: // TS 069 next = 'C'; 070 break; 071 case 0x4b53: // KS 072 next = 'X'; 073 break; 074 case 0x5046: // PF 075 case 0x5048: // PH 076 next = 'V'; 077 break; 078 case 0x5545: // UE 079 next = 'Y'; 080 break; 081 case 0x4145: // AE 082 prev = 'E'; 083 break; 084 case 0x4f45: // OE 085 next = 'Ö'; 086 break; 087 case 0x4f55: // OU 088 next = '§'; 089 break; 090 case 0x5155: // QU 091 in[j++] = 'K'; 092 next = 'W'; 093 break; 094 case 0x4549: // EI 095 case 0x4559: // EY 096 in[j++] = 'A'; 097 next = 'Y'; 098 break; 099 case 0x4555: // EU 100 in[j++] = 'O'; 101 next = 'Y'; 102 break; 103 case 0x4155: // AU 104 in[j++] = 'A'; 105 next = '§'; 106 break; 107 default: 108 in[j++] = prev; 109 break; 110 } 111 } 112 in[j++] = next; 113 int k = 0; 114 char prev = 0; 115 for (int i = 0; i < j; i++) { 116 char ch = in[i]; 117 switch (ch) { 118 case 'Z': 119 case 'K': 120 case 'G': 121 case 'Q': 122 case 'Ç': 123 ch = 'C'; 124 break; 125 case 'À': 126 case 'Á': 127 case 'Â': 128 case 'Ã': 129 case 'Å': 130 ch = 'A'; 131 break; 132 case 'Ä': 133 case 'Æ': 134 case 'È': 135 case 'É': 136 case 'Ê': 137 case 'Ë': 138 ch = 'E'; 139 break; 140 case 'I': 141 case 'J': 142 case 'Ì': 143 case 'Í': 144 case 'Î': 145 case 'Ï': 146 case 'Ü': 147 case 'Ý': 148 ch = 'Y'; 149 break; 150 case 'Ñ': 151 ch = 'N'; 152 break; 153 case 'Ò': 154 case 'Ó': 155 case 'Ô': 156 case 'Õ': 157 ch = 'O'; 158 break; 159 case 'Ø': 160 ch = 'Ö'; 161 break; 162 case 'ß': 163 ch = 'S'; 164 break; 165 case 'F': 166 case 'W': 167 ch = 'V'; 168 break; 169 case 'P': 170 ch = 'B'; 171 break; 172 case 'T': 173 ch = 'D'; 174 break; 175 case '§': 176 case 'Ù': 177 case 'Ú': 178 case 'Û': 179 ch = 'U'; 180 break; 181 case 'A': 182 case 'B': 183 case 'C': 184 case 'D': 185 case 'L': 186 case 'M': 187 case 'N': 188 case 'O': 189 case 'R': 190 case 'S': 191 case 'U': 192 case 'V': 193 case 'X': 194 case 'Y': 195 case 'Ö': 196 break; 197 default: 198 continue; 199 } 200 if (ch != prev) 201 in[k++] = prev = ch; 202 } 203 return new String(in, 0, k); 204 } 205 206 public static void main(String[] args) { 207 Phonem inst = new Phonem(); 208 for (String arg : args) 209 System.out.println(inst.toFuzzy(arg)); 210 } 211}