001/* ***** BEGIN LICENSE BLOCK ***** 002 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 003 * 004 * The contents of this file are subject to the Mozilla Public License Version 005 * 1.1 (the "License"); you may not use this file except in compliance with 006 * the License. You may obtain a copy of the License at 007 * http://www.mozilla.org/MPL/ 008 * 009 * Software distributed under the License is distributed on an "AS IS" basis, 010 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 011 * for the specific language governing rights and limitations under the 012 * License. 013 * 014 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in 015 * Java(TM), hosted at https://github.com/gunterze/dcm4che. 016 * 017 * The Initial Developer of the Original Code is 018 * Agfa Healthcare. 019 * Portions created by the Initial Developer are Copyright (C) 2011 020 * the Initial Developer. All Rights Reserved. 021 * 022 * Contributor(s): 023 * See listed authors below. 024 * 025 * Alternatively, the contents of this file may be used under the terms of 026 * either the GNU General Public License Version 2 or later (the "GPL"), or 027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 028 * in which case the provisions of the GPL or the LGPL are applicable instead 029 * of those above. If you wish to allow use of your version of this file only 030 * under the terms of either the GPL or the LGPL, and not to allow others to 031 * use your version of this file under the terms of the MPL, indicate your 032 * decision by deleting the provisions above and replace them with the notice 033 * and other provisions required by the GPL or the LGPL. If you do not delete 034 * the provisions above, a recipient may use your version of this file under 035 * the terms of any one of the MPL, the GPL or the LGPL. 036 * 037 * ***** END LICENSE BLOCK ***** */ 038 039package org.dcm4che3.soundex; 040 041/** 042 * @author Gunter Zeilinger <gunterze@gmail.com> 043 */ 044public class Metaphone implements FuzzyStr { 045 046 @Override 047 public String toFuzzy(String s) { 048 if (s == null || s.length() == 0) 049 return ""; 050 051 char[] in = s.toUpperCase().toCharArray(); 052 int countX = 0; 053 for (char c : in) 054 if (c == 'X') 055 countX++; 056 char[] out = countX > 0 ? new char[in.length + countX] : in; 057 int i = 0; 058 int j = 0; 059 char prev = 0; 060 char cur = 0; 061 char next1 = in[0]; 062 char next2 = in.length > 1 ? in[1] : 0; 063 064 // Initial kn-, gn- pn, ae- or wr- -> drop first letter 065 if (next2 == 'N' && (next1 == 'K' || next1 == 'G' || next1 == 'P') 066 || next1 == 'A' && next2 == 'E' 067 || next1 == 'W' && next2 == 'R') { 068 next1 = next2; 069 next2 = in.length > 2 ? in[2] : 0; 070 i++; 071 // Initial x- -> change to "s" 072 } else if (next1 == 'X') { 073 next1 = 'S'; 074 // Initial wh- -> change to "w" 075 } else if (next1 == 'W' && next2 == 'H') { 076 next2 = in.length > 2 ? in[2] : 0; 077 i++; 078 } 079 for (; i < in.length; i++) { 080 prev = cur; 081 cur = next1; 082 next1 = next2; 083 next2 = i+2 < in.length ? in[i+2] : 0; 084 // Doubled letters except "g" -> drop 2nd letter. 085 if (cur == prev && cur != 'C') 086 continue; 087 088 switch (cur) { 089 // Vowels are only kept when they are the first letter. 090 case 'A': 091 case 'E': 092 case 'I': 093 case 'O': 094 case 'U': 095 if (j == 0) 096 out[j++] = cur; 097 break; 098 // B -> B unless at the end of a word after "m" as in "dumb" 099 case 'B': 100 if (!(next1 == 0 && prev == 'M')) 101 out[j++] = cur; 102 break; 103 // C -> X (sh) if -cia- or -ch- 104 // S if -ci-, -ce- or -cy- 105 // SILENT if "-sci-", "-sce-", or "-scy-" 106 // K otherwise, including -sch- 107 case 'C': 108 if (next1 == 'I' || next1 == 'E' || next1 == 'Y') { 109 if (prev != 'S') 110 out[j++] = next1 == 'I' && next2 == 'A' ? 'X' : 'S'; 111 } else 112 out[j++] = next1 == 'H' && prev != 'S' ? 'X' : 'K'; 113 break; 114 // D -> J if in -dge-, -dgy- or -dgi- 115 // T otherwise 116 case 'D': 117 out[j++] = next1 == 'G' 118 && (next2 == 'I' || next2 == 'E' || next2 == 'Y') 119 ? 'J' : 'T'; 120 break; 121 // F -> F 122 // J -> J 123 // L -> L 124 // M -> M 125 // N -> N 126 // R -> R 127 case 'F': 128 case 'J': 129 case 'L': 130 case 'M': 131 case 'N': 132 case 'R': 133 out[j++] = cur; 134 break; 135 // G -> silent if in -gh- and not at end or before a vowel 136 // in -gn or -gned (also see dge etc. above) 137 // J if before i or e or y if not double gg 138 // K otherwise 139 case 'G': 140 if (next1 == 'H' && next2 != 0 && !vowel(next2) 141 || next1 == 'N' 142 && (next2 == 0 || next2 == 'E' 143 && in.length == (i+4) && in[3] == 'D') 144 || prev == 'D' 145 && (next1 == 'I' || next1 == 'E' || next1 == 'Y')) 146 continue; 147 // if double gg, next1 == 'G' -> K 148 out[j++] = (next1 == 'I' || next1 == 'E' || next1 == 'Y') 149 ? 'J' : 'K'; 150 break; 151 // H -> silent if after vowel and no vowel follows 152 // or in "-ch-", "-sh-", "-ph-", "-th-", "-gh-" 153 // H otherwise 154 case 'H': 155 switch (prev) { 156 case 'A': 157 case 'E': 158 case 'I': 159 case 'O': 160 case 'U': 161 if (!vowel(next1)) 162 continue; 163 break; 164 case 'C': 165 case 'S': 166 case 'P': 167 case 'T': 168 case 'G': 169 continue; 170 } 171 out[j++] = cur; 172 break; 173 // K -> silent if after "c" 174 // K otherwise 175 case 'K': 176 if (prev != 'C') 177 out[j++] = cur; 178 break; 179 // P -> F if before "h" 180 // P otherwise 181 case 'P': 182 out[j++] = (next1 == 'H') ? 'F' : 'P'; 183 break; 184 // Q -> K 185 case 'Q': 186 out[j++] = 'K'; 187 break; 188 // S -> X (sh) if before "h" or in -sio- or -sia- 189 // S otherwise 190 case 'S': 191 out[j++] = next1 == 'H' 192 || next1 == 'I' && (next2 == 'O' || next2 == 'A') 193 ? 'X' : 'S'; 194 break; 195 // T -> X (sh) if -tia- or -tio- 196 // 0 (th) if before "h" 197 // silent if in -tch- 198 // T otherwise 199 case 'T': 200 if (!(next1 == 'C' || next2 == 'H')) 201 out[j++] = next1 == 'I' && (next2 == 'A' || next2 == 'O') 202 ? 'X' : (next1 == 'H') ? '0' : 'T'; 203 break; 204 // V -> F 205 case 'V': 206 out[j++] = 'F'; 207 break; 208 // W -> silent if not followed by a vowel 209 // W if followed by a vowel 210 // Y -> silent if not followed by a vowel 211 // Y if followed by a vowel 212 case 'W': 213 case 'Y': 214 if (vowel(next1)) 215 out[j++] = cur; 216 break; 217 // X -> KS 218 case 'X': 219 out[j++] = 'K'; 220 out[j++] = 'S'; 221 break; 222 // Z -> S 223 case 'Z': 224 out[j++] = 'S'; 225 break; 226 default: 227 continue; 228 } 229 } 230 return new String(out, 0, j); 231 } 232 233 private static boolean vowel(char ch) { 234 return ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U'; 235 } 236 237 public static void main(String[] args) { 238 Metaphone inst = new Metaphone(); 239 for (String arg : args) 240 System.out.println(inst.toFuzzy(arg)); 241 } 242 243}