Source code

001/* ***** BEGIN LICENSE BLOCK *****
002 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
003 *
004 * The contents of this file are subject to the Mozilla Public License Version
005 * 1.1 (the "License"); you may not use this file except in compliance with
006 * the License. You may obtain a copy of the License at
007 * http://www.mozilla.org/MPL/
008 *
009 * Software distributed under the License is distributed on an "AS IS" basis,
010 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
011 * for the specific language governing rights and limitations under the
012 * License.
013 *
014 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in
015 * Java(TM), hosted at https://github.com/gunterze/dcm4che.
016 *
017 * The Initial Developer of the Original Code is
018 * Agfa Healthcare.
019 * Portions created by the Initial Developer are Copyright (C) 2011
020 * the Initial Developer. All Rights Reserved.
021 *
022 * Contributor(s):
023 * See listed authors below.
024 *
025 * Alternatively, the contents of this file may be used under the terms of
026 * either the GNU General Public License Version 2 or later (the "GPL"), or
027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
028 * in which case the provisions of the GPL or the LGPL are applicable instead
029 * of those above. If you wish to allow use of your version of this file only
030 * under the terms of either the GPL or the LGPL, and not to allow others to
031 * use your version of this file under the terms of the MPL, indicate your
032 * decision by deleting the provisions above and replace them with the notice
033 * and other provisions required by the GPL or the LGPL. If you do not delete
034 * the provisions above, a recipient may use your version of this file under
035 * the terms of any one of the MPL, the GPL or the LGPL.
036 *
037 * ***** END LICENSE BLOCK ***** */
038
039package org.dcm4che3.soundex;
040
041/**
042 * Implementation of the PHONEM substitutions as described in Georg Wilde and
043 * Carsten Meyer, Doppelgaenger gesucht - Ein Programm fuer kontextsensitive
044 * phonetische Textumwandlung ct Magazin fuer Computer & Technik 25/1998.
045 * 
046 * @see <a href="http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf"
047 *      >Martin Wilz: Aspekte der Kodierung phonetischer Ähnlichkeiten in
048 *      deutschen Eigennamen</a>
049 * 
050 * @author Gunter Zeilinger <gunterze@gmail.com>
051 */
052public class Phonem implements FuzzyStr {
053
054    @Override
055    public String toFuzzy(String s) {
056        if (s == null || s.length() == 0)
057            return "";
058
059        char[] in = s.toUpperCase().toCharArray();
060        char next = in[0];
061        int j = 0;
062        for (int i = 1; i < in.length; i++) {
063            char prev = next;
064            switch ((prev << 8) + (next = in[i])) {
065            case 0x5343:        // SC
066            case 0x535a:        // SZ
067            case 0x435a:        // CZ
068            case 0x5453:        // TS
069                next = 'C';
070                break;
071            case 0x4b53:        // KS
072                next = 'X';
073                break;
074            case 0x5046:        // PF
075            case 0x5048:        // PH
076                next = 'V';
077                break;
078            case 0x5545:        // UE
079                next = 'Y';
080                break;
081            case 0x4145:        // AE
082                prev = 'E';
083                break;
084            case 0x4f45:        // OE
085                next = 'Ö';
086                break;
087            case 0x4f55:        // OU
088                next = '§';
089                break;
090            case 0x5155:        // QU
091                in[j++] = 'K';
092                next = 'W';
093                break;
094            case 0x4549:        // EI
095            case 0x4559:        // EY
096                in[j++] = 'A';
097                next = 'Y';
098                break;
099            case 0x4555:        // EU
100                in[j++] = 'O';
101                next = 'Y';
102                break;
103            case 0x4155:        // AU
104                in[j++] = 'A';
105                next = '§';
106                break;
107            default:
108                in[j++] = prev;
109                break;
110            }
111        }
112        in[j++] = next;
113        int k = 0;
114        char prev = 0;
115        for (int i = 0; i < j; i++) {
116            char ch = in[i];
117            switch (ch) {
118            case 'Z':
119            case 'K':
120            case 'G':
121            case 'Q':
122            case 'Ç':
123                ch = 'C';
124                break;
125            case 'À':
126            case 'Á':
127            case 'Â':
128            case 'Ã':
129            case 'Å':
130                ch = 'A';
131                break;
132            case 'Ä':
133            case 'Æ':
134            case 'È':
135            case 'É':
136            case 'Ê':
137            case 'Ë':
138                ch = 'E';
139                break;
140            case 'I':
141            case 'J':
142            case 'Ì':
143            case 'Í':
144            case 'Î':
145            case 'Ï':
146            case 'Ü':
147            case 'Ý':
148                ch = 'Y';
149                break;
150            case 'Ñ':
151                ch = 'N';
152                break;
153            case 'Ò':
154            case 'Ó':
155            case 'Ô':
156            case 'Õ':
157                ch = 'O';
158                break;
159            case 'Ø':
160                ch = 'Ö';
161                break;
162            case 'ß':
163                ch = 'S';
164                break;
165            case 'F':
166            case 'W':
167                ch = 'V';
168                break;
169            case 'P':
170                ch = 'B';
171                break;
172            case 'T':
173                ch = 'D';
174                break;
175            case '§':
176            case 'Ù':
177            case 'Ú':
178            case 'Û':
179                ch = 'U';
180                break;
181            case 'A':
182            case 'B':
183            case 'C':
184            case 'D':
185            case 'L':
186            case 'M':
187            case 'N':
188            case 'O':
189            case 'R':
190            case 'S':
191            case 'U':
192            case 'V':
193            case 'X':
194            case 'Y':
195            case 'Ö':
196                break;
197            default:
198                continue;
199            }
200            if (ch != prev)
201                in[k++] = prev = ch;
202        }
203        return new String(in, 0, k);
204    }
205
206    public static void main(String[] args) {
207        Phonem inst = new Phonem();
208        for (String arg : args)
209            System.out.println(inst.toFuzzy(arg));
210    }
211}