001/* ***** BEGIN LICENSE BLOCK *****
002 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
003 *
004 * The contents of this file are subject to the Mozilla Public License Version
005 * 1.1 (the "License"); you may not use this file except in compliance with
006 * the License. You may obtain a copy of the License at
007 * http://www.mozilla.org/MPL/
008 *
009 * Software distributed under the License is distributed on an "AS IS" basis,
010 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
011 * for the specific language governing rights and limitations under the
012 * License.
013 *
014 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in
015 * Java(TM), hosted at https://github.com/gunterze/dcm4che.
016 *
017 * The Initial Developer of the Original Code is
018 * Agfa Healthcare.
019 * Portions created by the Initial Developer are Copyright (C) 2011
020 * the Initial Developer. All Rights Reserved.
021 *
022 * Contributor(s):
023 * See listed authors below.
024 *
025 * Alternatively, the contents of this file may be used under the terms of
026 * either the GNU General Public License Version 2 or later (the "GPL"), or
027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
028 * in which case the provisions of the GPL or the LGPL are applicable instead
029 * of those above. If you wish to allow use of your version of this file only
030 * under the terms of either the GPL or the LGPL, and not to allow others to
031 * use your version of this file under the terms of the MPL, indicate your
032 * decision by deleting the provisions above and replace them with the notice
033 * and other provisions required by the GPL or the LGPL. If you do not delete
034 * the provisions above, a recipient may use your version of this file under
035 * the terms of any one of the MPL, the GPL or the LGPL.
036 *
037 * ***** END LICENSE BLOCK ***** */
038
039package org.dcm4che3.soundex;
040
041/**
042 * @author Gunter Zeilinger <gunterze@gmail.com>
043 */
044public class Metaphone implements FuzzyStr {
045
046    @Override
047    public String toFuzzy(String s) {
048        if (s == null || s.length() == 0)
049            return "";
050
051        char[] in = s.toUpperCase().toCharArray();
052        int countX = 0;
053        for (char c : in)
054            if (c == 'X')
055                countX++;
056        char[] out = countX > 0 ? new char[in.length + countX] : in;
057        int i = 0;
058        int j = 0;
059        char prev = 0;
060        char cur = 0;
061        char next1 = in[0];
062        char next2 = in.length > 1 ? in[1] : 0;
063        
064        // Initial  kn-, gn- pn, ae- or wr-      -> drop first letter
065        if (next2 == 'N' && (next1 == 'K' || next1 == 'G' || next1 == 'P')
066                || next1 == 'A' && next2 == 'E'
067                || next1 == 'W' && next2 == 'R') {
068            next1 = next2;
069            next2 = in.length > 2 ? in[2] : 0;
070            i++;
071        // Initial  x-                           -> change to "s"
072        } else if (next1 == 'X') {
073            next1 = 'S';
074        // Initial  wh-                          -> change to "w"
075        } else if (next1 == 'W' && next2 == 'H') {
076            next2 = in.length > 2 ? in[2] : 0;
077            i++;
078        }
079        for (; i < in.length; i++) {
080            prev = cur;
081            cur = next1;
082            next1 = next2;
083            next2 = i+2 < in.length ? in[i+2] : 0;
084            // Doubled letters except "g" -> drop 2nd letter.
085            if (cur == prev && cur != 'C')
086                continue;
087            
088            switch (cur) {
089            // Vowels are only kept when they are the first letter.
090            case 'A':
091            case 'E':
092            case 'I':
093            case 'O':
094            case 'U':
095                if (j == 0)
096                    out[j++] = cur;
097                break;
098            // B -> B   unless at the end of a word after "m" as in "dumb"
099            case 'B':
100                if (!(next1 == 0 && prev == 'M'))
101                    out[j++] = cur;
102                break;
103            // C -> X    (sh) if -cia- or -ch-
104            //            S   if -ci-, -ce- or -cy-
105            //            SILENT if "-sci-", "-sce-", or "-scy-"
106            //            K   otherwise, including -sch-
107            case 'C':
108                if (next1 == 'I' || next1 == 'E' || next1 == 'Y') {
109                    if (prev != 'S')
110                        out[j++] = next1 == 'I' && next2 == 'A' ? 'X' : 'S';
111                } else
112                    out[j++] = next1 == 'H' && prev != 'S' ? 'X' : 'K';
113                break;
114            // D -> J   if in -dge-, -dgy- or -dgi-
115            //      T   otherwise
116            case 'D':
117                out[j++] = next1 == 'G'
118                        && (next2 == 'I' || next2 == 'E' || next2 == 'Y') 
119                        ? 'J' : 'T';
120                break;
121            // F -> F
122            // J -> J
123            // L -> L
124            // M -> M
125            // N -> N
126            // R -> R
127            case 'F':
128            case 'J':
129            case 'L':
130            case 'M':
131            case 'N':
132            case 'R':
133                out[j++] = cur;
134                break;
135            // G ->     silent if in -gh- and not at end or before a vowel
136            //          in -gn or -gned (also see dge etc. above)
137            //      J   if before i or e or y if not double gg
138            //      K   otherwise
139            case 'G':
140                if (next1 == 'H' && next2 != 0 && !vowel(next2)
141                        || next1 == 'N' 
142                                && (next2 == 0 || next2 == 'E' 
143                                        && in.length == (i+4) && in[3] == 'D')
144                        || prev == 'D' 
145                                && (next1 == 'I' || next1 == 'E' || next1 == 'Y'))
146                    continue;
147                // if double gg, next1 == 'G' -> K
148                out[j++] = (next1 == 'I' || next1 == 'E' || next1 == 'Y') 
149                        ? 'J' : 'K';
150                break;
151            // H ->     silent if after vowel and no vowel follows
152            //          or in "-ch-", "-sh-", "-ph-", "-th-", "-gh-"
153            //      H   otherwise
154            case 'H':
155                switch (prev) {
156                case 'A':
157                case 'E':
158                case 'I':
159                case 'O':
160                case 'U':
161                    if (!vowel(next1))
162                        continue;
163                    break;
164                case 'C':
165                case 'S':
166                case 'P':
167                case 'T':
168                case 'G':
169                    continue;
170                }
171                out[j++] = cur;
172                break;
173            // K ->     silent if after "c"
174            //      K   otherwise
175            case 'K':
176                if (prev != 'C')
177                    out[j++] = cur;
178                break;
179            // P -> F   if before "h"
180            //      P   otherwise
181            case 'P':
182                out[j++] = (next1 == 'H') ? 'F' : 'P';
183                break;
184            // Q -> K
185            case 'Q':
186                out[j++] = 'K';
187                break;
188            // S -> X   (sh) if before "h" or in -sio- or -sia-
189            //      S   otherwise
190            case 'S':
191                out[j++] = next1 == 'H' 
192                        || next1 == 'I' && (next2 == 'O' || next2 == 'A')
193                        ? 'X' : 'S';
194                break;
195            // T -> X   (sh) if -tia- or -tio-
196            //      0   (th) if before "h"
197            //          silent if in -tch-
198            //      T   otherwise
199            case 'T':
200                if (!(next1 == 'C' || next2 == 'H'))
201                    out[j++] = next1 == 'I' && (next2 == 'A' || next2 == 'O')
202                            ? 'X' : (next1 == 'H') ? '0' : 'T';
203                break;
204            // V -> F
205            case 'V':
206                out[j++] = 'F';
207                break;
208            // W ->     silent if not followed by a vowel
209            //      W   if followed by a vowel
210            // Y ->     silent if not followed by a vowel
211            //      Y   if followed by a vowel
212            case 'W':
213            case 'Y':
214                if (vowel(next1))
215                    out[j++] = cur;
216                break;
217            // X -> KS
218            case 'X':
219                out[j++] = 'K';
220                out[j++] = 'S';
221                break;
222            // Z -> S 
223            case 'Z':
224                out[j++] = 'S';
225                break;
226            default:
227                continue;
228            }
229        }
230        return new String(out, 0, j);
231    }
232
233    private static boolean vowel(char ch) {
234        return ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U';
235    }
236
237    public static void main(String[] args) {
238        Metaphone inst = new Metaphone();
239        for (String arg : args)
240            System.out.println(inst.toFuzzy(arg));
241    }
242
243}