001/* ***** BEGIN LICENSE BLOCK *****
002 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
003 *
004 * The contents of this file are subject to the Mozilla Public License Version
005 * 1.1 (the "License"); you may not use this file except in compliance with
006 * the License. You may obtain a copy of the License at
007 * http://www.mozilla.org/MPL/
008 *
009 * Software distributed under the License is distributed on an "AS IS" basis,
010 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
011 * for the specific language governing rights and limitations under the
012 * License.
013 *
014 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in
015 * Java(TM), hosted at http://sourceforge.net/projects/dcm4che.
016 *
017 * The Initial Developer of the Original Code is
018 * Gunter Zeilinger, Huetteldorferstr. 24/10, 1150 Vienna/Austria/Europe.
019 * Portions created by the Initial Developer are Copyright (C) 2010
020 * the Initial Developer. All Rights Reserved.
021 *
022 * Contributor(s):
023 * Gunter Zeilinger <gunterze@gmail.com>
024 *
025 * Alternatively, the contents of this file may be used under the terms of
026 * either the GNU General Public License Version 2 or later (the "GPL"), or
027 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
028 * in which case the provisions of the GPL or the LGPL are applicable instead
029 * of those above. If you wish to allow use of your version of this file only
030 * under the terms of either the GPL or the LGPL, and not to allow others to
031 * use your version of this file under the terms of the MPL, indicate your
032 * decision by deleting the provisions above and replace them with the notice
033 * and other provisions required by the GPL or the LGPL. If you do not delete
034 * the provisions above, a recipient may use your version of this file under
035 * the terms of any one of the MPL, the GPL or the LGPL.
036 *
037 * ***** END LICENSE BLOCK ***** */
038
039/* ***** BEGIN LICENSE BLOCK *****
040 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
041 *
042 * The contents of this file are subject to the Mozilla Public License Version
043 * 1.1 (the "License"); you may not use this file except in compliance with
044 * the License. You may obtain a copy of the License at
045 * http://www.mozilla.org/MPL/
046 *
047 * Software distributed under the License is distributed on an "AS IS" basis,
048 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
049 * for the specific language governing rights and limitations under the
050 * License.
051 *
052 * The Original Code is part of dcm4che, an implementation of DICOM(TM) in
053 * Java(TM), hosted at https://github.com/gunterze/dcm4che.
054 *
055 * The Initial Developer of the Original Code is
056 * Agfa Healthcare.
057 * Portions created by the Initial Developer are Copyright (C) 2011
058 * the Initial Developer. All Rights Reserved.
059 *
060 * Contributor(s):
061 * See @authors listed below
062 *
063 * Alternatively, the contents of this file may be used under the terms of
064 * either the GNU General Public License Version 2 or later (the "GPL"), or
065 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
066 * in which case the provisions of the GPL or the LGPL are applicable instead
067 * of those above. If you wish to allow use of your version of this file only
068 * under the terms of either the GPL or the LGPL, and not to allow others to
069 * use your version of this file under the terms of the MPL, indicate your
070 * decision by deleting the provisions above and replace them with the notice
071 * and other provisions required by the GPL or the LGPL. If you do not delete
072 * the provisions above, a recipient may use your version of this file under
073 * the terms of any one of the MPL, the GPL or the LGPL.
074 *
075 * ***** END LICENSE BLOCK ***** */
076
077package org.dcm4che3.data;
078
079import java.io.UnsupportedEncodingException;
080import java.lang.ref.SoftReference;
081import java.nio.ByteBuffer;
082import java.nio.CharBuffer;
083import java.nio.charset.*;
084import java.util.Arrays;
085import java.util.StringTokenizer;
086
/**
 * Converts between Java strings and byte sequences according to the
 * character set(s) identified by DICOM Specific Character Set codes
 * (e.g. {@code "ISO_IR 100"}, {@code "ISO 2022 IR 87"}).
 *
 * <p>An instance created from a single code encodes and decodes with that
 * one character set. An instance created from several codes (see
 * {@link #valueOf(String...)}) switches between the character sets by means
 * of escape sequences embedded in the encoded bytes.
 *
 * @author Gunter Zeilinger <gunterze@gmail.com>
 */
public class SpecificCharacterSet {

    /** Character set using only the ISO 646 (US-ASCII) codec; it carries no DICOM codes. */
    public static final SpecificCharacterSet ASCII = new SpecificCharacterSet(new Codec[]{Codec.ISO_646});

    /**
     * Character set returned by {@link #valueOf(String...)} if no code is
     * given. Initially {@link #ASCII}; replaced by
     * {@link #setDefaultCharacterSet(String)}.
     */
    public static SpecificCharacterSet DEFAULT = ASCII;

    // Per-thread caches of the encoders for the first and the second
    // configured character set, used by ISO2022.encode().
    private static final ThreadLocal<SoftReference<Encoder>> cachedEncoder1 = new ThreadLocal<SoftReference<Encoder>>();
    private static final ThreadLocal<SoftReference<Encoder>> cachedEncoder2 = new ThreadLocal<SoftReference<Encoder>>();

    /** Codecs of the configured character sets, in the order of the codes. */
    protected final Codec[] codecs;
    /** DICOM codes this instance was created from. */
    protected final String[] dicomCodes;

    /**
     * Supported character sets. Each constant holds the Java charset name,
     * whether the character set contains ASCII, the escape sequences
     * (without the leading ESC byte) emitted for it as G0 ({@code escSeq0})
     * and as G1 ({@code escSeq1}) set, with {@code 0} meaning that no
     * escape sequence is emitted, and the step width used by
     * {@link ISO2022#decode(byte[])} to advance over encoded characters.
     */
    private enum Codec {
        ISO_646("US-ASCII", true, 0x2842, 0, 1),
        ISO_8859_1("ISO-8859-1", true, 0x2842, 0x2d41, 1),
        ISO_8859_2("ISO-8859-2", true, 0x2842, 0x2d42, 1),
        ISO_8859_3("ISO-8859-3", true, 0x2842, 0x2d43, 1),
        ISO_8859_4("ISO-8859-4", true, 0x2842, 0x2d44, 1),
        ISO_8859_5("ISO-8859-5", true, 0x2842, 0x2d4c, 1),
        ISO_8859_6("ISO-8859-6", true, 0x2842, 0x2d47, 1),
        ISO_8859_7("ISO-8859-7", true, 0x2842, 0x2d46, 1),
        ISO_8859_8("ISO-8859-8", true, 0x2842, 0x2d48, 1),
        ISO_8859_9("ISO-8859-9", true, 0x2842, 0x2d4d, 1),
        JIS_X_201("JIS_X0201", true, 0x284a, 0x2949, 1) {
            @Override
            public String toText(String s) {
                // U+00A5 (yen sign), written as escape so that the result
                // does not depend on the encoding of this source file
                return s.replace('\\', '\u00a5');
            }
        },
        TIS_620("TIS-620", true, 0x2842, 0x2d54, 1),
        JIS_X_208("x-JIS0208", false, 0x2442, 0, 1),
        JIS_X_212("JIS_X0212-1990", false, 0x242844, 0, 2),
        KS_X_1001("EUC-KR", false, 0x2842, 0x242943, -1),
        GB2312("GB2312", false, 0x2842, 0x242941, -1),
        UTF_8("UTF-8", true, 0, 0, -1),
        GB18030("GB18030", false, 0, 0, -1);

        private final String charsetName;
        private final boolean containsASCII;
        private final int escSeq0;
        private final int escSeq1;
        // > 0: fixed step width in bytes; otherwise the decoder steps
        // 2 bytes for a byte with the high bit set and 1 byte else
        private final int bytesPerChar;

        Codec(String charsetName, boolean containsASCII, int escSeq0, int escSeq1, int bytesPerChar) {
            this.charsetName = charsetName;
            this.containsASCII = containsASCII;
            this.escSeq0 = escSeq0;
            this.escSeq1 = escSeq1;
            this.bytesPerChar = bytesPerChar;
        }

        /**
         * Returns the codec for the given DICOM code.
         *
         * @param code DICOM Specific Character Set code, may be {@code null}
         * @return the matching codec; {@link #ISO_646} if {@code code} is
         *         {@code null} or not recognized
         */
        public static Codec forCode(String code) {
            if (code == null)
                return ISO_646;

            // dispatch on a cheap hash of the last two characters first,
            // then verify the candidate by full string comparison
            switch(last2digits(code)) {
            case 0:
                if (code.equals("ISO_IR 100") || code.equals("ISO 2022 IR 100"))
                    return Codec.ISO_8859_1;
                break;
            case 1:
                if (code.equals("ISO_IR 101") || code.equals("ISO 2022 IR 101"))
                    return Codec.ISO_8859_2;
                break;
            case 6:
                if (code.equals("ISO 2022 IR 6"))
                    return Codec.ISO_646;
                break;
            case 9:
                if (code.equals("ISO_IR 109") || code.equals("ISO 2022 IR 109"))
                    return Codec.ISO_8859_3;
                break;
            case 10:
                if (code.equals("ISO_IR 110") || code.equals("ISO 2022 IR 110"))
                    return Codec.ISO_8859_4;
                break;
            case 13:
                if (code.equals("ISO_IR 13") || code.equals("ISO 2022 IR 13"))
                    return Codec.JIS_X_201;
                break;
            case 26:
                if (code.equals("ISO_IR 126") || code.equals("ISO 2022 IR 126"))
                    return Codec.ISO_8859_7;
                break;
            case 27:
                if (code.equals("ISO_IR 127") || code.equals("ISO 2022 IR 127"))
                    return Codec.ISO_8859_6;
                break;
            case 30:
                if (code.equals("GB18030"))
                    return Codec.GB18030;
                break;
            case 31:
                // 'B' & 15 == 2 and 'K' & 15 == 11, hence 2 * 10 + 11
                if (code.equals("GBK"))
                    return Codec.GB18030;
                break;
            case 38:
                if (code.equals("ISO_IR 138") || code.equals("ISO 2022 IR 138"))
                    return Codec.ISO_8859_8;
                break;
            case 44:
                if (code.equals("ISO_IR 144") || code.equals("ISO 2022 IR 144"))
                    return Codec.ISO_8859_5;
                break;
            case 48:
                if (code.equals("ISO_IR 148") || code.equals("ISO 2022 IR 148"))
                    return Codec.ISO_8859_9;
                break;
            case 49:
                if (code.equals("ISO 2022 IR 149"))
                    return Codec.KS_X_1001;
                break;
            case 58:
                if (code.equals("ISO 2022 IR 58"))
                    return Codec.GB2312;
                break;
            case 59:
                if (code.equals("ISO 2022 IR 159"))
                    return Codec.JIS_X_212;
                break;
            case 66:
                if (code.equals("ISO_IR 166") || code.equals("ISO 2022 IR 166"))
                    return Codec.TIS_620;
                break;
            case 87:
                if (code.equals("ISO 2022 IR 87"))
                    return Codec.JIS_X_208;
                break;
            case 92:
                if (code.equals("ISO_IR 192"))
                    return Codec.UTF_8;
                break;
            }
            return ISO_646;
        }

        /**
         * Combines the low 4 bits of the last two characters of
         * {@code code} as if they were decimal digits.
         *
         * @return the combined value, or {@code -1} if {@code code} is
         *         shorter than two characters
         */
        private static int last2digits(String code) {
            int len = code.length();
            if (len < 2)
                return -1;
            char ch1 = code.charAt(len-1);
            char ch2 = code.charAt(len-2);
            return (ch2 & 15) * 10 + (ch1 & 15);
        }

        /**
         * Encodes the string with the charset of this codec.
         *
         * @throws AssertionError if the charset is not supported by the JVM
         */
        public byte[] encode(String val) {
            try {
                return val.getBytes(charsetName);
            } catch (UnsupportedEncodingException e) {
                throw new AssertionError(e);
            }
        }

        /**
         * Decodes {@code len} bytes of {@code b}, starting at {@code off},
         * with the charset of this codec.
         *
         * @throws AssertionError if the charset is not supported by the JVM
         */
        public String decode(byte[] b, int off, int len) {
            try {
                return new String(b, off, len, charsetName);
            } catch (UnsupportedEncodingException e) {
                throw new AssertionError(e);
            }
        }

        public boolean containsASCII() {
            return containsASCII;
        }

        public int getEscSeq0() {
            return escSeq0;
        }

        public int getEscSeq1() {
            return escSeq1;
        }

        public int getBytesPerChar() {
            return bytesPerChar;
        }

        /**
         * Adjusts a string for display with this character set. The default
         * implementation returns the string unchanged.
         */
        public String toText(String s) {
            return s;
        }
    }

    /** Pairs a {@link Codec} with a reusable {@link CharsetEncoder} for its charset. */
    private static final class Encoder {
        final Codec codec;
        final CharsetEncoder encoder;

        public Encoder(Codec codec) {
            this.codec = codec;
            this.encoder = Charset.forName(codec.charsetName).newEncoder();
        }

        /**
         * Writes the given escape sequence (if not {@code 0}) followed by
         * the encoded remaining characters of {@code cb} to {@code bb}.
         *
         * @param cb          characters to encode
         * @param bb          destination buffer
         * @param escSeq      escape sequence to emit first, {@code 0} for none
         * @param errorAction action applied on malformed input and on
         *                    unmappable characters
         * @return {@code true} on success; {@code false} if a
         *         {@link CharacterCodingException} occurred, in which case
         *         the positions of both buffers are restored
         */
        public boolean encode(CharBuffer cb, ByteBuffer bb, int escSeq,
                CodingErrorAction errorAction) {
            encoder.onMalformedInput(errorAction)
                    .onUnmappableCharacter(errorAction)
                    .reset();
            int cbmark = cb.position();
            int bbmark = bb.position();
            try {
                escSeq(bb, escSeq);
                CoderResult cr = encoder.encode(cb, bb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = encoder.flush(bb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                cb.position(cbmark);
                bb.position(bbmark);
                return false;
            }
            return true;
        }

        /**
         * Writes ESC followed by the two or three low order bytes of
         * {@code seq}; writes nothing if {@code seq} is {@code 0}.
         */
        private static void escSeq(ByteBuffer bb, int seq) {
            if (seq == 0)
                return;

            bb.put((byte) 0x1b);
            int b1 = seq >> 16;
            if (b1 != 0)
                bb.put((byte) b1);
            bb.put((byte) (seq >> 8));
            bb.put((byte) seq);
        }

        public byte[] replacement() {
            return encoder.replacement();
        }

        /**
         * Returns the maximal number of bytes this encoder may produce for
         * the given number of characters.
         */
        int maxEncodedLength(int chars) {
            return (int) Math.ceil(chars * (double) encoder.maxBytesPerChar());
        }
    }

    /**
     * Character set with more than one configured codec; switches between
     * the codecs by means of escape sequences.
     */
    private static final class ISO2022 extends SpecificCharacterSet {

        /** Longest escape sequence written by {@link Encoder#escSeq}: ESC plus three bytes. */
        private static final int MAX_ESC_SEQ_LEN = 4;

        /**
         * Upper bound assumed for the number of bytes any of the supported
         * charsets produces for one {@code char} (including its
         * replacement bytes).
         */
        private static final int MAX_BYTES_PER_CHAR = 4;

        private ISO2022(Codec[] charsetInfos, String... codes) {
            super(charsetInfos, codes);
        }

        /**
         * Encodes the value. First the whole value is encoded with the
         * first configured character set. If that fails, the value is split
         * at the given delimiters and each component is encoded with the
         * first character set able to represent it, preceded by the escape
         * sequence selecting that character set.
         *
         * @param val        value to encode
         * @param delimiters characters separating the components of the
         *                   value; before each of them and at the end of
         *                   the value the initial character set is
         *                   activated again
         * @return the encoded bytes
         */
        @Override
        public byte[] encode(String val, String delimiters) {
            int strlen = val.length();
            CharBuffer cb = CharBuffer.wrap(val.toCharArray());
            Encoder enc1 = encoder(cachedEncoder1, codecs[0]);
            // size by the encoder's maximum so that a multi-byte first
            // character set cannot overflow the buffer
            byte[] buf = new byte[enc1.maxEncodedLength(strlen)];
            ByteBuffer bb = ByteBuffer.wrap(buf);
            // try to encode whole string value with character set specified
            // by value1 of (0008,0005) Specific Character Set
            if (!enc1.encode(cb, bb, 0, CodingErrorAction.REPORT)) {
                // split whole string value according VR specific delimiters
                // and try to encode each component separately
                Encoder[] encs = new Encoder[codecs.length];
                encs[0] = enc1;
                encs[1] = encoder(cachedEncoder2, codecs[1]);
                StringTokenizer comps = new StringTokenizer(val, delimiters, true);
                // worst case: every character is preceded by an escape
                // sequence, and every delimiter as well as the end of the
                // value by two escape sequences (reset of G0 and G1)
                buf = new byte[(MAX_ESC_SEQ_LEN + MAX_BYTES_PER_CHAR) * strlen
                        + 2 * MAX_ESC_SEQ_LEN * (comps.countTokens() + 1)];
                bb = ByteBuffer.wrap(buf);
                int[] cur = { 0, 0 };
                while (comps.hasMoreTokens()) {
                    String comp = comps.nextToken();
                    if (comp.length() == 1 && delimiters.indexOf(comp.charAt(0)) >= 0) { // if delimiter
                        activateInitialCharacterSet(bb, cur);
                        bb.put((byte) comp.charAt(0));
                        continue;
                    }
                    cb = CharBuffer.wrap(comp.toCharArray());
                    encodeComponent(encs, cb, bb, cur);
                }
                activateInitialCharacterSet(bb, cur);
            }
            return Arrays.copyOf(buf, bb.position());
        }

        /**
         * Encodes one component.
         *
         * @param encs encoders by codec index; missing ones are created on
         *             demand
         * @param cb   characters of the component
         * @param bb   destination buffer
         * @param cur  indices of the codecs currently active as G0
         *             ({@code cur[0]}) and G1 ({@code cur[1]}); updated if
         *             another character set gets activated
         */
        private void encodeComponent(Encoder[] encs, CharBuffer cb, ByteBuffer bb, int[] cur) {
            // try to encode component with current active character of G1
            if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, CodingErrorAction.REPORT))
                return;

            // try to encode component with current active character set of G0, if different to G1
            if ((codecs[cur[1]].getEscSeq1() == 0 || codecs[cur[1]].getEscSeq0() != codecs[cur[0]].getEscSeq0())
                    && encs[cur[0]].encode(cb, bb, 0, CodingErrorAction.REPORT))
                return;

            // try the configured character sets from last to first,
            // emitting the escape sequence which activates the candidate
            int next = encs.length;
            while (--next >= 0) {
                if (encs[next] == null)
                    encs[next] = new Encoder(codecs[next]);
                if (codecs[next].getEscSeq1() != 0) {
                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), CodingErrorAction.REPORT)) {
                        cur[1] = next;
                        break;
                    }
                } else {
                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), CodingErrorAction.REPORT)) {
                        cur[0] = next;
                        break;
                    }
                }
            }
            if (next < 0) {
                if (cb.length() > 1) {
                    // no single character set can encode the whole
                    // component: encode it character by character
                    for (int i = 0; i < cb.length(); i++) {
                        encodeComponent(encs, cb.subSequence(i, i + 1), bb, cur);
                    }
                } else {
                    // character could not be encoded with any of the
                    // specified character sets, encode it with the
                    // current character set of G0, using the default
                    // replacement of the character set decoder
                    // for characters which cannot be encoded
                    bb.put(encs[cur[0]].replacement());
                }
            }
        }

        /**
         * Emits the escape sequences of the first configured character set
         * for each of G0 and G1 which currently has another character set
         * active, and resets {@code cur} accordingly.
         */
        private void activateInitialCharacterSet(ByteBuffer bb, int[] cur) {
            if (cur[0] != 0) {
                Encoder.escSeq(bb, codecs[0].getEscSeq0());
                cur[0] = 0;
            }
            if (cur[1] != 0) {
                Encoder.escSeq(bb, codecs[0].getEscSeq1());
                cur[1] = 0;
            }
        }

        /**
         * Decodes the bytes, starting with the first configured character
         * set and switching character sets at each recognized escape
         * sequence. Unrecognized escape sequences, and escape sequences cut
         * off by the end of the value, are decoded as characters. A
         * multi-byte character cut off by the end of the value is decoded
         * from the bytes actually present.
         *
         * @param b encoded value
         * @return the decoded string
         */
        @Override
        public String decode(byte[] b) {
            // codecs currently active as G0 (index 0) and G1 (index 1)
            Codec[] codec = { codecs[0], codecs[0] };
            int g = 0;   // index of the set the pending bytes belong to
            int off = 0; // start of the pending (not yet decoded) bytes
            int cur = 0; // scan position
            StringBuilder sb = new StringBuilder(b.length);
            while (cur < b.length) {
                if (b[cur] == 0x1b) { // ESC
                    if (off < cur) {
                        sb.append(codec[g].decode(b, off, cur - off));
                    }
                    if (b.length - cur < 3) {
                        // escape sequence cut off by end of value:
                        // decode the remaining bytes as chars
                        sb.append(codec[0].decode(b, cur, b.length - cur));
                        cur = b.length;
                        off = cur;
                        break;
                    }
                    cur += 3;
                    switch (((b[cur - 2] & 255) << 8) + (b[cur - 1] & 255)) {
                        case 0x2428:
                            if (cur == b.length) { // 4th byte missing
                                sb.append(codec[0].decode(b, cur - 3, 3));
                            } else if (b[cur++] == 0x44) {
                                codec[0] = Codec.JIS_X_212;
                            } else { // decode invalid ESC sequence as chars
                                sb.append(codec[0].decode(b, cur - 4, 4));
                            }
                            break;
                        case 0x2429:
                            if (cur == b.length) { // 4th byte missing
                                sb.append(codec[0].decode(b, cur - 3, 3));
                                break;
                            }
                            switch (b[cur++]) {
                                case 0x41:
                                    switchCodec(codec, 1, Codec.GB2312);
                                    break;
                                case 0x43:
                                    switchCodec(codec, 1, Codec.KS_X_1001);
                                    break;
                                default: // decode invalid ESC sequence as chars
                                    sb.append(codec[0].decode(b, cur - 4, 4));
                            }
                            break;
                        case 0x2442:
                            codec[0] = Codec.JIS_X_208;
                            break;
                        case 0x2842:
                            switchCodec(codec, 0, Codec.ISO_646);
                            break;
                        case 0x284a:
                            codec[0] = Codec.JIS_X_201;
                            if (codec[1].getEscSeq1() == 0)
                                codec[1] = codec[0];
                            break;
                        case 0x2949:
                            codec[1] = Codec.JIS_X_201;
                            break;
                        case 0x2d41:
                            switchCodec(codec, 1, Codec.ISO_8859_1);
                            break;
                        case 0x2d42:
                            switchCodec(codec, 1, Codec.ISO_8859_2);
                            break;
                        case 0x2d43:
                            switchCodec(codec, 1, Codec.ISO_8859_3);
                            break;
                        case 0x2d44:
                            switchCodec(codec, 1, Codec.ISO_8859_4);
                            break;
                        case 0x2d46:
                            switchCodec(codec, 1, Codec.ISO_8859_7);
                            break;
                        case 0x2d47:
                            switchCodec(codec, 1, Codec.ISO_8859_6);
                            break;
                        case 0x2d48:
                            switchCodec(codec, 1, Codec.ISO_8859_8);
                            break;
                        case 0x2d4c:
                            switchCodec(codec, 1, Codec.ISO_8859_5);
                            break;
                        case 0x2d4d:
                            switchCodec(codec, 1, Codec.ISO_8859_9);
                            break;
                        case 0x2d54:
                            switchCodec(codec, 1, Codec.TIS_620);
                            break;
                        default: // decode invalid ESC sequence as chars
                            sb.append(codec[0].decode(b, cur - 3, 3));
                    }
                    off = cur;
                } else {
                    // with different codecs for G0 and G1, a byte with the
                    // high bit set belongs to G1, any other byte to G0;
                    // flush the pending bytes when the set changes
                    if (codec[0] != codec[1] && g == (b[cur] < 0 ? 0 : 1)) {
                        if (off < cur) {
                            sb.append(codec[g].decode(b, off, cur - off));
                        }
                        off = cur;
                        g = 1 - g;
                    }
                    int bytesPerChar = codec[g].getBytesPerChar();
                    cur += bytesPerChar > 0 ? bytesPerChar : b[cur] < 0 ? 2 : 1;
                }
            }
            // cur may have stepped beyond the end of the value, so the
            // pending bytes are bounded by the array length
            if (off < b.length) {
                sb.append(codec[g].decode(b, off, b.length - off));
            }
            return sb.toString();
        }

        /**
         * Activates {@code codec} at index {@code i}; if both active codecs
         * then share the same G0 escape sequence, the codec of index 1 is
         * used for index 0, too.
         */
        private void switchCodec(Codec[] codecs, int i, Codec codec) {
            codecs[i] = codec;
            if (codecs[0].getEscSeq0() == codecs[1].getEscSeq0())
                codecs[0] = codecs[1];
        }

    }

    /** Returns the current {@link #DEFAULT} character set. */
    public static SpecificCharacterSet getDefaultCharacterSet() {
        return DEFAULT;
    }

    /**
     * Replaces the {@link #DEFAULT} character set.
     *
     * @param code DICOM code of the new default character set;
     *             {@code null} selects {@link #ASCII}
     * @throws IllegalArgumentException if the character set does not
     *                                  contain ASCII
     */
    public static void setDefaultCharacterSet(String code) {
        SpecificCharacterSet cs = code != null ? valueOf(code) : ASCII;
        if (!cs.containsASCII())
            throw new IllegalArgumentException("Default Character Set must contain ASCII - " + code);
        DEFAULT = cs;
    }

    /**
     * Returns a character set for the given DICOM codes.
     *
     * @param codes DICOM Specific Character Set codes; unrecognized and
     *              {@code null} entries are treated as ISO 646
     * @return {@link #DEFAULT} if {@code codes} is {@code null} or empty;
     *         otherwise a new instance, which switches between the
     *         character sets by escape sequences if more than one code is
     *         given
     */
    public static SpecificCharacterSet valueOf(String... codes) {
        if (codes == null || codes.length == 0)
            return DEFAULT;

        Codec[] infos = new Codec[codes.length];
        for (int i = 0; i < codes.length; i++)
            infos[i] = Codec.forCode(codes[i]);
        return codes.length > 1 ? new ISO2022(infos,codes)
                : new SpecificCharacterSet(infos, codes);
    }

    /**
     * Returns the DICOM codes this instance was created from. The returned
     * array is not a copy and must not be modified.
     */
    public String[] toCodes () {
        return dicomCodes;
    }

    /**
     * Returns the encoder cached in the given thread local if it matches
     * the codec; otherwise creates a new encoder and caches it.
     */
    private static Encoder encoder(ThreadLocal<SoftReference<Encoder>> tl,
            Codec codec) {
        SoftReference<Encoder> sr;
        Encoder enc;
        if ((sr = tl.get()) == null || (enc = sr.get()) == null
                || enc.codec != codec)
            tl.set(new SoftReference<Encoder>(enc = new Encoder(codec)));
        return enc;
    }

    protected SpecificCharacterSet(Codec[] codecs, String... codes) {
        this.codecs = codecs;
        this.dicomCodes = codes;
    }

    /**
     * Encodes the value with the first configured character set.
     *
     * @param val        value to encode
     * @param delimiters not used by this implementation
     * @return the encoded bytes
     */
    public byte[] encode(String val, String delimiters) {
        return codecs[0].encode(val);
    }

    /** Decodes the bytes with the first configured character set. */
    public String decode(byte[] val) {
        return codecs[0].decode(val, 0, val.length);
    }

    /** Returns whether the first configured character set is UTF-8. */
    public boolean isUTF8() {
        return codecs[0].equals(Codec.UTF_8);
    }

    /** Returns whether the first configured character set is ISO 646. */
    public boolean isASCII() {
        return codecs[0].equals(Codec.ISO_646);
    }

    /** Returns whether the first configured character set contains ASCII. */
    public boolean containsASCII() {
        return codecs[0].containsASCII();
    }

    /**
     * Adjusts a string for display according to the first configured
     * character set, see {@code Codec.toText}.
     */
    public String toText(String s) {
        return codecs[0].toText(s);
    }

    /**
     * Two character sets are equal if they are of the same class and
     * configured with the same codecs in the same order; the DICOM codes
     * they were created from are not compared.
     */
    @Override public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (other == null) {
            return false;
        }
        if (getClass() != other.getClass()) {
            return false;
        }
        final SpecificCharacterSet othercs = (SpecificCharacterSet) other;
        return Arrays.equals(this.codecs,othercs.codecs);
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(this.codecs);
    }

}