/* $Xorg: lcCT.c,v 1.4 2000/08/17 19:45:16 cpqbld Exp $ */ /* * Copyright 1992, 1993 by TOSHIBA Corp. * * Permission to use, copy, modify, and distribute this software and its * documentation for any purpose and without fee is hereby granted, provided * that the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting * documentation, and that the name of TOSHIBA not be used in advertising * or publicity pertaining to distribution of the software without specific, * written prior permission. TOSHIBA make no representations about the * suitability of this software for any purpose. It is provided "as is" * without express or implied warranty. * * TOSHIBA DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL * TOSHIBA BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. * * Author: Katsuhisa Yano TOSHIBA Corp. * mopi@osa.ilab.toshiba.co.jp */ /* * Copyright 1995 by FUJITSU LIMITED * This is source code modified by FUJITSU LIMITED under the Joint * Development Agreement for the CDE/Motif PST. * * Modifier: Takanori Tateno FUJITSU LIMITED * */ /* * 2000 * Modifier: Ivan Pascal The XFree86 Project * Modifier: Bruno Haible The XFree86 Project */ /* $XFree86: xc/lib/X11/lcCT.c,v 3.26 2001/10/28 03:32:34 tsi Exp $ */ #include "Xlibint.h" #include "XlcPubI.h" #include #include /* ====================== Built-in Character Sets ====================== */ /* * Static representation of a character set that can be used in Compound Text. 
*/
typedef struct _CTDataRec {
    const char *name;        /* charset name; entries below carry a side
                                suffix such as ":GL", ":GR" or ":GLGR",
                                except "ISO10646-1" which has none */
    const char *ct_sequence; /* Compound Text encoding, ESC sequence */
} CTDataRec, *CTData;

/*
 * Built-in table of charsets and their Compound Text designators.
 * _XlcInitCTInfo walks this array from first to last element and passes
 * each pair to _XlcAddCT, so the order of the entries is the order of
 * registration.  Some charsets appear more than once (e.g. ISO8859-14:GR
 * with both "\033-_" and "\033%/1"); _XlcAddCT accepts a second sequence
 * for an already known charset.
 */
static CTDataRec default_ct_data[] =
{
    /*                                                                    */
    /* X11 registry name       MIME name         ISO-IR      ESC sequence */
    /*                                                                    */

    /* Registered character sets with one byte per character */
    { "ISO8859-1:GL",       /* US-ASCII              6   */  "\033(B" },
    { "ISO8859-1:GR",       /* ISO-8859-1          100   */  "\033-A" },
    { "ISO8859-2:GR",       /* ISO-8859-2          101   */  "\033-B" },
    { "ISO8859-3:GR",       /* ISO-8859-3          109   */  "\033-C" },
    { "ISO8859-4:GR",       /* ISO-8859-4          110   */  "\033-D" },
    { "ISO8859-5:GR",       /* ISO-8859-5          144   */  "\033-L" },
    { "ISO8859-6:GR",       /* ISO-8859-6          127   */  "\033-G" },
    { "ISO8859-7:GR",       /* ISO-8859-7          126   */  "\033-F" },
    { "ISO8859-8:GR",       /* ISO-8859-8          138   */  "\033-H" },
    { "ISO8859-9:GR",       /* ISO-8859-9          148   */  "\033-M" },
    { "ISO8859-10:GR",      /* ISO-8859-10         157   */  "\033-V" },
    { "ISO8859-11:GR",      /* ISO-8859-11         166   */  "\033-T" },
    { "ISO8859-13:GR",      /* ISO-8859-13         179   */  "\033-Y" },
    { "ISO8859-14:GR",      /* ISO-8859-14         199   */  "\033-_" },
    { "ISO8859-15:GR",      /* ISO-8859-15         203   */  "\033-b" },
    { "ISO8859-16:GR",      /* ISO-8859-16         226   */  "\033-f" },
    { "JISX0201.1976-0:GL", /* ISO-646-JP           14   */  "\033(J" },
    { "JISX0201.1976-0:GR",                                  "\033)I" },
    /* Disabled: uses the same sequence as ISO8859-11:GR above. */
#if 0
    { "TIS620-0:GR",        /* TIS-620             166   */  "\033-T" },
#endif

    /* Registered character sets with two byte per character */
    { "GB2312.1980-0:GL",   /* GB_2312-80           58   */  "\033$(A" },
    { "GB2312.1980-0:GR",   /* GB_2312-80           58   */  "\033$)A" },
    /* Both JISX0208 revisions share one designator; _XlcAddCT treats
       that pair as a permitted duplicate. */
    { "JISX0208.1983-0:GL", /* JIS_X0208-1983       87   */  "\033$(B" },
    { "JISX0208.1983-0:GR", /* JIS_X0208-1983       87   */  "\033$)B" },
    { "JISX0208.1990-0:GL", /* JIS_X0208-1990      168   */  "\033$(B" },
    { "JISX0208.1990-0:GR", /* JIS_X0208-1990      168   */  "\033$)B" },
    { "JISX0212.1990-0:GL", /* JIS_X0212-1990      159   */  "\033$(D" },
    { "JISX0212.1990-0:GR", /* JIS_X0212-1990      159   */  "\033$)D" },
    { "KSC5601.1987-0:GL",  /* KS_C_5601-1987      149   */  "\033$(C" },
    { "KSC5601.1987-0:GR",  /* KS_C_5601-1987      149   */  "\033$)C" },
    { "CNS11643.1986-1:GL", /* CNS 11643-1992 pl.1 171   */
      "\033$(G" },
    { "CNS11643.1986-1:GR", /* CNS 11643-1992 pl.1 171   */  "\033$)G" },
    { "CNS11643.1986-2:GL", /* CNS 11643-1992 pl.2 172   */  "\033$(H" },
    { "CNS11643.1986-2:GR", /* CNS 11643-1992 pl.2 172   */  "\033$)H" },
    { "CNS11643.1992-3:GL", /* CNS 11643-1992 pl.3 183   */  "\033$(I" },
    { "CNS11643.1992-3:GR", /* CNS 11643-1992 pl.3 183   */  "\033$)I" },
    { "CNS11643.1992-4:GL", /* CNS 11643-1992 pl.4 184   */  "\033$(J" },
    { "CNS11643.1992-4:GR", /* CNS 11643-1992 pl.4 184   */  "\033$)J" },
    { "CNS11643.1992-5:GL", /* CNS 11643-1992 pl.5 185   */  "\033$(K" },
    { "CNS11643.1992-5:GR", /* CNS 11643-1992 pl.5 185   */  "\033$)K" },
    { "CNS11643.1992-6:GL", /* CNS 11643-1992 pl.6 186   */  "\033$(L" },
    { "CNS11643.1992-6:GR", /* CNS 11643-1992 pl.6 186   */  "\033$)L" },
    { "CNS11643.1992-7:GL", /* CNS 11643-1992 pl.7 187   */  "\033$(M" },
    { "CNS11643.1992-7:GR", /* CNS 11643-1992 pl.7 187   */  "\033$)M" },

    /* Registered encodings with a varying number of bytes per character */
    { "ISO10646-1",         /* UTF-8               196   */  "\033%G" },

    /* Encodings without ISO-IR assigned escape sequence must be
       defined in XLC_LOCALE files, using "\033%/1" or "\033%/2". */

    /* Backward compatibility with XFree86 3.x */
#if 1
    { "ISO8859-14:GR",                                       "\033%/1" },
    { "ISO8859-15:GR",                                       "\033%/1" },
#endif

    /* For use by utf8 -> ctext */
    { "BIG5-0:GLGR",                                         "\033%/2"},

    /* used by Emacs, but not backed by ISO-IR */
    { "BIG5-E0:GL",                                          "\033$(0" },
    { "BIG5-E0:GR",                                          "\033$)0" },
    { "BIG5-E1:GL",                                          "\033$(1" },
    { "BIG5-E1:GR",                                          "\033$)1" },
};

/* We represent UTF-8 as an XlcGLGR charset, not in extended segments.
*/
/* When nonzero, compiles in the "#if UTF8_IN_EXTSEQ" paths of cttocs and
   cstoct, which handle a charset named "ISO10646-1" as UTF-8 data. */
#define UTF8_IN_EXTSEQ 0

/* ======================= Parsing ESC Sequences ======================= */

/*
 * Single byte values.  XctESC and XctCSI are the two introducers that
 * _XlcParseCT recognizes as the first byte of a sequence.
 */
#define XctC0           0x0000
#define XctHT           0x0009
#define XctNL           0x000a
#define XctESC          0x001b
#define XctGL           0x0020
#define XctC1           0x0080
#define XctCSI          0x009b
#define XctGR           0x00a0
/* Appended by _XlcAddCT as the terminator of an extended segment name. */
#define XctSTX          0x0002

/*
 * Bytes that may follow XctESC.  Several of these values are also used
 * directly as return codes of _XlcParseCT.
 */
#define XctCntrlFunc    0x0023  /* '#' */
#define XctMB           0x0024  /* '$' */
#define XctOtherCoding  0x0025  /* '%' */
#define XctGL94         0x0028  /* '(' */
#define XctGR94         0x0029  /* ')' */
#define XctGR96         0x002d  /* '-' */
#define XctNonStandard  0x002f  /* '/' */
#define XctIgnoreExt    0x0030  /* '0' */
#define XctNotIgnoreExt 0x0031  /* '1' */

/*
 * Bytes that may follow XctCSI.
 * NOTE: XctLeftToRight has the same numeric value as XctNotIgnoreExt, and
 * XctDirection the same as XctDirectionEnd, so these pairs cannot be told
 * apart by value alone.
 */
#define XctLeftToRight  0x0031  /* '1' */
#define XctRightToLeft  0x0032  /* '2' */
#define XctDirection    0x005d  /* ']' */
#define XctDirectionEnd 0x005d  /* ']' */

/*
 * Two-byte codes used only as _XlcParseCT return values; each is the pair
 * of bytes following ESC packed into one number ("$(", "$)", "%/", "%@").
 */
#define XctGL94MB       0x2428
#define XctGR94MB       0x2429
#define XctExtSeg       0x252f
#define XctReturn       0x2540

/*
 * Parses the header of a Compound Text segment, i.e. the charset designator.
 * The string starts at *text and has *length bytes.
 * Return value is one of:
 *   0 (no valid charset designator),
 *   XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB,
 *   XctLeftToRight, XctRightToLeft, XctDirectionEnd,
 *   XctExtSeg, XctOtherCoding, XctReturn, XctIgnoreExt, XctNotIgnoreExt.
 * If the return value is not 0, *text is incremented and *length decremented,
 * to point past the charset designator. If the return value is one of
 *   XctGL94, XctGR94, XctGR96, XctGL94MB, XctGR94MB,
 *   XctExtSeg, XctOtherCoding, XctIgnoreExt, XctNotIgnoreExt,
 * *final_byte is set to the "final byte" of the charset designator.
*/
/*
 * Implementation notes: the input is walked with a local cursor and the
 * caller's *text / *length are only updated once a non-zero result is known,
 * so a failed parse leaves both untouched.  *final_byte is cleared on entry.
 */
static unsigned int
_XlcParseCT(
    const char **text,
    int *length,
    unsigned char *final_byte)
{
    const unsigned char *cursor = (const unsigned char *) *text;
    unsigned int kind = 0;
    unsigned char c;
    int avail;

    *final_byte = 0;

    /* The available length does not change until the very end. */
    avail = *length;
    if (avail < 1)
        return 0;

    c = *cursor++;
    if (c == XctESC) {
        if (avail < 2)
            return 0;

        c = *cursor++;
        if (c == XctOtherCoding) {                      /* % */
            if (avail < 3)
                return 0;
            c = *cursor++;
            if (c == XctNonStandard) {                  /* / */
                if (avail < 4)
                    return 0;
                kind = XctExtSeg;
                c = *cursor++;
            } else if (c == '@') {
                kind = XctReturn;
            } else {
                kind = XctOtherCoding;
            }
            *final_byte = c;
        } else if (c == XctCntrlFunc) {                 /* # */
            if (avail < 4)
                return 0;
            *final_byte = *cursor++;
            c = *cursor++;
            if (c == XctIgnoreExt)                      /* 0 */
                kind = XctIgnoreExt;
            else if (c == XctNotIgnoreExt)              /* 1 */
                kind = XctNotIgnoreExt;
        } else if (c == XctMB) {                        /* $ */
            if (avail < 4)
                return 0;
            c = *cursor++;
            if (c == XctGL94)                           /* ( */
                kind = XctGL94MB;
            else if (c == XctGR94)                      /* ) */
                kind = XctGR94MB;
            /* The final byte is consumed even for an unknown side. */
            *final_byte = *cursor++;
        } else if (c == XctGL94 || c == XctGR94 || c == XctGR96) {
            /* ( ) - : the result code equals the intermediate byte. */
            if (avail < 3)
                return 0;
            kind = c;
            *final_byte = *cursor++;
        }
    } else if (c == XctCSI) {
        /* direction */
        if (avail < 2)
            return 0;

        c = *cursor++;
        if (c == XctLeftToRight || c == XctRightToLeft) {
            if (avail < 3)
                return 0;
            /* The result code equals the parameter byte just read. */
            if (*cursor++ == XctDirection)
                kind = c;
        } else if (c == XctDirectionEnd) {
            kind = XctDirectionEnd;
        }
    }

    if (kind != 0) {
        *length -= (const char *) cursor - *text;
        *text = (const char *) cursor;
    }
    return kind;
}

/*
 * Fills into a freshly created XlcCharSet the fields that can be inferred
 * from the ESC sequence. These are side, char_size, set_size.
 * Returns True if the charset can be used with Compound Text.
*
 * Used by _XlcCreateDefaultCharSet.
 */
/*
 * The charset is left unmodified when False is returned.  A char_size of 0
 * means a varying number of bytes per character.
 */
Bool
_XlcParseCharSet(
    XlcCharSet charset)
{
    const char *seq = charset->ct_sequence;
    unsigned char final_byte;
    unsigned int kind;
    int seq_len;
    int bytes_per_char;

    /* An empty sequence means the charset has no Compound Text form. */
    if (*seq == '\0')
        return False;

    seq_len = strlen(seq);
    kind = _XlcParseCT(&seq, &seq_len, &final_byte);

    /* Validate the designator and derive the character width. */
    if (kind == XctGL94 || kind == XctGR94 || kind == XctGR96) {
        bytes_per_char = 1;
    } else if (kind == XctGL94MB || kind == XctGR94MB) {
        if (final_byte < 0x60)
            bytes_per_char = 2;
        else if (final_byte < 0x70)
            bytes_per_char = 3;
        else
            bytes_per_char = 4;
    } else if (kind == XctExtSeg) {
        /* The final byte is the ASCII digit giving the width. */
        bytes_per_char = final_byte - '0';
        if (bytes_per_char < 0 || bytes_per_char > 4)
            return False;
    } else if (kind == XctOtherCoding) {
        bytes_per_char = 0;
    } else {
        return False;
    }

    charset->char_size = bytes_per_char;

    /* Derive side and set size from the designator type. */
    if (kind == XctExtSeg || kind == XctOtherCoding) {
        charset->side = XlcGLGR;
        charset->set_size = 0;
    } else {
        if (kind == XctGL94 || kind == XctGL94MB)
            charset->side = XlcGL;
        else
            charset->side = XlcGR;

        if (kind == XctGR96)
            charset->set_size = 96;
        else
            charset->set_size = 94;
    }

    return True;
}

/* =============== Management of the List of Character Sets =============== */

/*
 * Representation of a character set that can be used for Compound Text,
 * at run time.
 * Note: This information is not contained in the XlcCharSet, because
 * multiple ESC sequences may be used for the same XlcCharSet.
*/ typedef struct _CTInfoRec { XlcCharSet charset; const char *ct_sequence; /* Compound Text ESC sequence */ unsigned int type; unsigned char final_byte; /* If type == XctExtSeg: */ const char *ext_segment; /* extended segment name, then '\002' */ int ext_segment_len; /* length of above, including final '\002' */ struct _CTInfoRec *next; } CTInfoRec, *CTInfo; /* * List of character sets that can be used for Compound Text, * Includes all that are listed in default_ct_data, but more can be added * at runtime through _XlcAddCT. */ static CTInfo ct_list = NULL; static CTInfo ct_list_end = NULL; /* * Returns a Compound Text info record for an ESC sequence. * The first part of the ESC sequence has already been parsed into 'type' * and 'final_byte'. The remainder starts at 'text', at least 'text_len' * bytes (only used if type == XctExtSeg). */ static CTInfo _XlcGetCTInfo( unsigned int type, unsigned char final_byte, const char *text, int text_len) { CTInfo ct_info; for (ct_info = ct_list; ct_info; ct_info = ct_info->next) if (ct_info->type == type && ct_info->final_byte == final_byte && (type != XctExtSeg || (text_len >= ct_info->ext_segment_len && memcmp(text, ct_info->ext_segment, ct_info->ext_segment_len) == 0))) return ct_info; return (CTInfo) NULL; } /* Returns the Compound Text info for a given XlcCharSet. Returns NULL if none is found. */ static CTInfo _XlcGetCTInfoFromCharSet( XlcCharSet charset) { CTInfo ct_info; for (ct_info = ct_list; ct_info; ct_info = ct_info->next) if (ct_info->charset == charset) return ct_info; return (CTInfo) NULL; } /* Creates a new XlcCharSet, given its name (including side suffix) and Compound Text ESC sequence (normally at most 4 bytes), and makes it eligible for Compound Text processing. 
*/ XlcCharSet _XlcAddCT( const char *name, const char *ct_sequence) { CTInfo ct_info, existing_info; XlcCharSet charset; const char *ct_ptr; int length; unsigned int type; unsigned char final_byte; charset = _XlcGetCharSet(name); if (charset != NULL) { /* Even if the charset already exists, it is OK to register a second Compound Text sequence for it. */ } else { /* Attempt to create the charset. */ charset = _XlcCreateDefaultCharSet(name, ct_sequence); if (charset == NULL) return (XlcCharSet) NULL; _XlcAddCharSet(charset); } /* Allocate a CTinfo record. */ length = strlen(ct_sequence); ct_info = (CTInfo) Xmalloc(sizeof(CTInfoRec) + length+1); if (ct_info == NULL) return charset; ct_info->charset = charset; ct_info->ct_sequence = strcpy((char *) (ct_info + 1), ct_sequence); /* Parse the Compound Text sequence. */ ct_ptr = ct_sequence; type = _XlcParseCT(&ct_ptr, &length, &final_byte); ct_info->type = type; ct_info->final_byte = final_byte; switch (type) { case XctGL94: case XctGR94: case XctGR96: case XctGL94MB: case XctGR94MB: case XctOtherCoding: ct_info->ext_segment = NULL; ct_info->ext_segment_len = 0; break; case XctExtSeg: { /* By convention, the extended segment name is the encoding_name in lowercase. */ const char *q = charset->encoding_name; int n = strlen(q); char *p; /* Ensure ct_info->ext_segment_len <= 0x3fff - 6. */ if (n > 0x3fff - 6 - 1) { Xfree(ct_info); return charset; } p = (char *) Xmalloc(n+1); if (p == NULL) { Xfree(ct_info); return charset; } ct_info->ext_segment = p; ct_info->ext_segment_len = n+1; for ( ; n > 0; p++, q++, n--) *p = (*q >= 'A' && *q <= 'Z' ? *q - 'A' + 'a' : *q); *p = XctSTX; break; } default: Xfree(ct_info); return (XlcCharSet) NULL; } /* Insert it into the list, if not already present. */ existing_info = _XlcGetCTInfo(type, ct_info->final_byte, ct_info->ext_segment, ct_info->ext_segment_len); if (existing_info == NULL) { /* Insert it at the end. 
If there are duplicates CTinfo entries for the same XlcCharSet, we want the first (standard) one to override the second (user defined) one. */ ct_info->next = NULL; if (ct_list_end) ct_list_end->next = ct_info; else ct_list = ct_info; ct_list_end = ct_info; } else { if (existing_info->charset != charset /* We have a conflict, with one exception: JISX0208.1983-0 and JISX0208.1990-0 are the same for all practical purposes. */ && !(strncmp(existing_info->charset->name, "JISX0208", 8) == 0 && strncmp(charset->name, "JISX0208", 8) == 0)) { fprintf(stderr, "Xlib: charsets %s and %s have the same CT sequence\n", charset->name, existing_info->charset->name); if (strcmp(charset->ct_sequence, ct_sequence) == 0) charset->ct_sequence = ""; } Xfree(ct_info); } return charset; } /* ========== Converters String <--> CharSet <--> Compound Text ========== */ /* * Structure representing the parse state of a Compound Text string. */ typedef struct _StateRec { XlcCharSet charset; /* The charset of the current segment */ XlcCharSet GL_charset; /* The charset responsible for 0x00..0x7F */ XlcCharSet GR_charset; /* The charset responsible for 0x80..0xFF */ XlcCharSet Other_charset; /* != NULL if currently in an other segment */ int ext_seg_left; /* > 0 if currently in an extended segment */ } StateRec, *State; /* Subroutine for parsing an ESC sequence. */ typedef enum { resOK, /* Charset saved in 'state', sequence skipped */ resNotInList, /* Charset not found, sequence skipped */ resNotCTSeq /* EscSeq not recognized, pointers not changed */ } CheckResult; static CheckResult _XlcCheckCTSequence( State state, const char **ctext, int *ctext_len) { XlcCharSet charset; CTInfo ct_info; const char *tmp_ctext = *ctext; int tmp_ctext_len = *ctext_len; unsigned int type; unsigned char final_byte; int ext_seg_left = 0; /* Check for validity. 
*/ type = _XlcParseCT(&tmp_ctext, &tmp_ctext_len, &final_byte); switch (type) { case XctGL94: case XctGR94: case XctGR96: case XctGL94MB: case XctGR94MB: case XctOtherCoding: *ctext = tmp_ctext; *ctext_len = tmp_ctext_len; break; case XctReturn: *ctext = tmp_ctext; *ctext_len = tmp_ctext_len; state->Other_charset = NULL; return resOK; case XctExtSeg: if (tmp_ctext_len > 2 && (tmp_ctext[0] & 0x80) && (tmp_ctext[0] & 0x80)) { unsigned int msb = tmp_ctext[0] & 0x7f; unsigned int lsb = tmp_ctext[1] & 0x7f; ext_seg_left = (msb << 7) + lsb; if (ext_seg_left <= tmp_ctext_len - 2) { *ctext = tmp_ctext + 2; *ctext_len = tmp_ctext_len - 2; break; } } return resNotCTSeq; default: return resNotCTSeq; } ct_info = _XlcGetCTInfo(type, final_byte, *ctext, ext_seg_left); if (ct_info) { charset = ct_info->charset; state->ext_seg_left = ext_seg_left; if (type == XctExtSeg) { state->charset = charset; /* Skip past the extended segment name and the separator. */ *ctext += ct_info->ext_segment_len; *ctext_len -= ct_info->ext_segment_len; state->ext_seg_left -= ct_info->ext_segment_len; } else if (type == XctOtherCoding) { state->Other_charset = charset; } else { if (charset->side == XlcGL) { state->GL_charset = charset; } else if (charset->side == XlcGR) { state->GR_charset = charset; } else { state->GL_charset = charset; state->GR_charset = charset; } } return resOK; } else { state->ext_seg_left = 0; if (type == XctExtSeg) { /* Skip the entire extended segment. */ *ctext += ext_seg_left; *ctext_len -= ext_seg_left; } return resNotInList; } } static void init_state( XlcConv conv) { State state = (State) conv->state; static XlcCharSet default_GL_charset = NULL; static XlcCharSet default_GR_charset = NULL; if (default_GL_charset == NULL) { default_GL_charset = _XlcGetCharSet("ISO8859-1:GL"); default_GR_charset = _XlcGetCharSet("ISO8859-1:GR"); } /* The initial state is ISO-8859-1 on both sides. 
*/ state->GL_charset = state->charset = default_GL_charset; state->GR_charset = default_GR_charset; state->Other_charset = NULL; state->ext_seg_left = 0; } /* from XlcNCompoundText to XlcNCharSet */ static int cttocs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { State state = (State) conv->state; XlcCharSet charset = NULL; const char *ctptr; char *bufptr; int ctext_len, buf_len; int unconv_num = 0; ctptr = (const char *) *from; bufptr = (char *) *to; ctext_len = *from_left; buf_len = *to_left; while (ctext_len > 0 && buf_len > 0) { if (state->ext_seg_left == 0) { /* Not in the middle of an extended segment; look at next byte. */ unsigned char ch = *ctptr; XlcCharSet ch_charset; if (ch == XctESC) { CheckResult ret = _XlcCheckCTSequence(state, &ctptr, &ctext_len); if (ret == resOK) /* state has been modified. */ continue; if (ret == resNotInList) { /* XXX Just continue with previous charset. */ unconv_num++; continue; } } else if (ch == XctCSI) { /* XXX Simply ignore the XctLeftToRight, XctRightToLeft, XctDirectionEnd sequences for the moment. */ unsigned char dummy; if (_XlcParseCT(&ctptr, &ctext_len, &dummy)) { unconv_num++; continue; } } /* Find the charset which is responsible for this byte. */ ch_charset = (state->Other_charset != NULL ? state->Other_charset : (ch & 0x80 ? state->GR_charset : state->GL_charset)); /* Set the charset of this run, or continue the current run, or stop the current run. */ if (charset) { if (charset != ch_charset) break; } else { state->charset = charset = ch_charset; } /* We don't want to split a character into multiple pieces. */ if (buf_len < 6) { if (charset->char_size > 0) { if (buf_len < charset->char_size) break; } else { /* char_size == 0 is tricky. The code here is good only for valid UTF-8 input. */ if (charset->ct_sequence[0] == XctESC && charset->ct_sequence[1] == XctOtherCoding && charset->ct_sequence[2] == 'G') { int char_size = (ch < 0xc0 ? 1 : ch < 0xe0 ? 
2 : ch < 0xf0 ? 3 : ch < 0xf8 ? 4 : ch < 0xfc ? 5 : 6); if (buf_len < char_size) break; } } } *bufptr++ = *ctptr++; ctext_len--; buf_len--; } else { /* Copy as much as possible from the current extended segment to the buffer. */ int char_size; /* Set the charset of this run, or continue the current run, or stop the current run. */ if (charset) { if (charset != state->charset) break; } else { charset = state->charset; } char_size = charset->char_size; if (state->ext_seg_left <= buf_len || char_size > 0) { int n = (state->ext_seg_left <= buf_len ? state->ext_seg_left : (buf_len / char_size) * char_size); memcpy(bufptr, ctptr, n); ctptr += n; ctext_len -= n; bufptr += n; buf_len -= n; state->ext_seg_left -= n; } else { #if UTF8_IN_EXTSEQ /* char_size == 0 is tricky. The code here is good only for valid UTF-8 input. */ if (strcmp(charset->name, "ISO10646-1") == 0) { unsigned char ch = *ctptr; int char_size = (ch < 0xc0 ? 1 : ch < 0xe0 ? 2 : ch < 0xf0 ? 3 : ch < 0xf8 ? 4 : ch < 0xfc ? 5 : 6); int i; if (buf_len < char_size) break; /* A small loop is faster than calling memcpy. */ for (i = char_size; i > 0; i--) *bufptr++ = *ctptr++; ctext_len -= char_size; buf_len -= char_size; state->ext_seg_left -= char_size; } else #endif { /* Here ctext_len >= state->ext_seg_left > buf_len. We may be splitting a character into multiple pieces. Oh well. */ int n = buf_len; memcpy(bufptr, ctptr, n); ctptr += n; ctext_len -= n; bufptr += n; buf_len -= n; state->ext_seg_left -= n; } } } } /* 'charset' is the charset for the current run. In some cases, 'state->charset' contains the charset for the next run. Therefore, return 'charset'. 'charset' may still be NULL only if no output was produced. 
*/ if (num_args > 0) *((XlcCharSet *) args[0]) = charset; *from_left -= ctptr - *((const char **) from); *from = (XPointer) ctptr; *to_left -= bufptr - *((char **) to); *to = (XPointer) bufptr; return unconv_num; } /* from XlcNCharSet to XlcNCompoundText */ static int cstoct( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { State state = (State) conv->state; XlcSide side; unsigned char min_ch = 0, max_ch = 0; int length, unconv_num; CTInfo ct_info; XlcCharSet charset; const char *csptr; char *ctptr; int csstr_len, ct_len; char *ext_segment_start; int char_size; /* One argument is required, of type XlcCharSet. */ if (num_args < 1) return -1; csptr = *((const char **) from); ctptr = *((char **) to); csstr_len = *from_left; ct_len = *to_left; charset = (XlcCharSet) args[0]; ct_info = _XlcGetCTInfoFromCharSet(charset); if (ct_info == NULL) return -1; side = charset->side; length = strlen(ct_info->ct_sequence); ext_segment_start = NULL; if (ct_info->type == XctOtherCoding) { /* Output the Escape sequence for switching to the charset, and reserve room now for the XctReturn sequence at the end. */ if (ct_len < length + 3) return -1; memcpy(ctptr, ct_info->ct_sequence, length); ctptr += length; ct_len -= length + 3; } else /* Test whether the charset is already active. */ if (((side == XlcGR || side == XlcGLGR) && charset != state->GR_charset) || ((side == XlcGL || side == XlcGLGR) && charset != state->GL_charset)) { /* Output the Escape sequence for switching to the charset. */ if (ct_info->type == XctExtSeg) { if (ct_len < length + 2 + ct_info->ext_segment_len) return -1; memcpy(ctptr, ct_info->ct_sequence, length); ctptr += length; ct_len -= length; ctptr += 2; ct_len -= 2; ext_segment_start = ctptr; /* The size of an extended segment must fit in 14 bits. 
*/ if (ct_len > 0x3fff) ct_len = 0x3fff; memcpy(ctptr, ct_info->ext_segment, ct_info->ext_segment_len); ctptr += ct_info->ext_segment_len; ct_len -= ct_info->ext_segment_len; } else { if (ct_len < length) return -1; memcpy(ctptr, ct_info->ct_sequence, length); ctptr += length; ct_len -= length; } } /* If the charset has side GL or GR, prepare remapping the characters to the correct side. */ if (charset->set_size) { min_ch = 0x20; max_ch = 0x7f; if (charset->set_size == 94) { max_ch--; if (charset->char_size > 1 || side == XlcGR) min_ch++; } } /* Actually copy the contents. */ unconv_num = 0; char_size = charset->char_size; if (char_size == 1) { while (csstr_len > 0 && ct_len > 0) { if (charset->set_size) { /* The CompoundText specification says that the only control characters allowed are 0x09, 0x0a, 0x1b, 0x9b. Therefore here we eliminate other control characters. */ unsigned char ch = *((unsigned char *) csptr) & 0x7f; if (!((ch >= min_ch && ch <= max_ch) || (side == XlcGL && (ch == 0x00 || ch == 0x09 || ch == 0x0a)) || ((side == XlcGL || side == XlcGR) && (ch == 0x1b)))) { csptr++; csstr_len--; unconv_num++; continue; } } if (side == XlcGL) *ctptr++ = *csptr++ & 0x7f; else if (side == XlcGR) *ctptr++ = *csptr++ | 0x80; else *ctptr++ = *csptr++; csstr_len--; ct_len--; } } else if (char_size > 1) { while (csstr_len >= char_size && ct_len >= char_size) { if (side == XlcGL) { int i; for (i = char_size; i > 0; i--) *ctptr++ = *csptr++ & 0x7f; } else if (side == XlcGR) { int i; for (i = char_size; i > 0; i--) *ctptr++ = *csptr++ | 0x80; } else { int i; for (i = char_size; i > 0; i--) *ctptr++ = *csptr++; } csstr_len -= char_size; ct_len -= char_size; } } else { /* char_size = 0. The code here is good only for valid UTF-8 input. 
*/ if ((charset->ct_sequence[0] == XctESC && charset->ct_sequence[1] == XctOtherCoding && charset->ct_sequence[2] == 'G') #if UTF8_IN_EXTSEQ || strcmp(charset->name, "ISO10646-1") == 0 #endif ) { while (csstr_len > 0 && ct_len > 0) { unsigned char ch = * (unsigned char *) csptr; int char_size = (ch < 0xc0 ? 1 : ch < 0xe0 ? 2 : ch < 0xf0 ? 3 : ch < 0xf8 ? 4 : ch < 0xfc ? 5 : 6); int i; if (!(csstr_len >= char_size && ct_len >= char_size)) break; for (i = char_size; i > 0; i--) *ctptr++ = *csptr++; csstr_len -= char_size; ct_len -= char_size; } } else { while (csstr_len > 0 && ct_len > 0) { *ctptr++ = *csptr++; csstr_len--; ct_len--; } } } if (ct_info->type == XctOtherCoding) { /* Terminate with an XctReturn sequence. */ ctptr[0] = XctESC; ctptr[1] = XctOtherCoding; ctptr[2] = '@'; ctptr += 3; } else if (ext_segment_start != NULL) { /* Backpatch the extended segment's length. */ int ext_segment_length = ctptr - ext_segment_start; *(ext_segment_start - 2) = (ext_segment_length >> 7) | 0x80; *(ext_segment_start - 1) = (ext_segment_length & 0x7f) | 0x80; } else { if (side == XlcGR || side == XlcGLGR) state->GR_charset = charset; if (side == XlcGL || side == XlcGLGR) state->GL_charset = charset; } *from_left -= csptr - *((const char **) from); *from = (XPointer) csptr; *to_left -= ctptr - *((char **) to); *to = (XPointer) ctptr; return 0; } /* from XlcNString to XlcNCharSet */ static int strtocs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { State state = (State) conv->state; const char *src; char *dst; unsigned char side; int length; src = (const char *) *from; dst = (char *) *to; length = min(*from_left, *to_left); side = *((unsigned char *) src) & 0x80; while (side == (*((unsigned char *) src) & 0x80) && length-- > 0) *dst++ = *src++; *from_left -= src - (const char *) *from; *from = (XPointer) src; *to_left -= dst - (char *) *to; *to = (XPointer) dst; if (num_args > 0) *((XlcCharSet *)args[0]) = (side ? 
state->GR_charset : state->GL_charset); return 0; } /* from XlcNCharSet to XlcNString */ static int cstostr( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { State state = (State) conv->state; const char *csptr; char *string_ptr; int csstr_len, str_len; unsigned char ch; int unconv_num = 0; /* This converter can only convert from ISO8859-1:GL and ISO8859-1:GR. */ if (num_args < 1 || !((XlcCharSet) args[0] == state->GL_charset || (XlcCharSet) args[0] == state->GR_charset)) return -1; csptr = *((const char **) from); string_ptr = *((char **) to); csstr_len = *from_left; str_len = *to_left; while (csstr_len > 0 && str_len > 0) { ch = *((unsigned char *) csptr++); csstr_len--; /* Citing ICCCM: "STRING as a type specifies the ISO Latin-1 character set plus the control characters TAB and NEWLINE." */ if ((ch < 0x20 && ch != 0x00 && ch != 0x09 && ch != 0x0a) || (ch >= 0x7f && ch < 0xa0)) { unconv_num++; continue; } *((unsigned char *) string_ptr++) = ch; str_len--; } *from_left -= csptr - *((const char **) from); *from = (XPointer) csptr; *to_left -= string_ptr - *((char **) to); *to = (XPointer) string_ptr; return unconv_num; } static XlcConv create_conv( XlcConvMethods methods) { XlcConv conv; conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + sizeof(StateRec)); if (conv == NULL) return (XlcConv) NULL; conv->state = (XPointer) &conv[1]; conv->methods = methods; init_state(conv); return conv; } static void close_converter( XlcConv conv) { /* conv->state is allocated together with conv, free both at once. 
*/ Xfree((char *) conv); } static XlcConvMethodsRec cttocs_methods = { close_converter, cttocs, init_state }; static XlcConv open_cttocs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(&cttocs_methods); } static XlcConvMethodsRec cstoct_methods = { close_converter, cstoct, init_state }; static XlcConv open_cstoct( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(&cstoct_methods); } static XlcConvMethodsRec strtocs_methods = { close_converter, strtocs, init_state }; static XlcConv open_strtocs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(&strtocs_methods); } static XlcConvMethodsRec cstostr_methods = { close_converter, cstostr, init_state }; static XlcConv open_cstostr( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(&cstostr_methods); } /* =========================== Initialization =========================== */ Bool _XlcInitCTInfo() { if (ct_list == NULL) { CTData ct_data; int num; XlcCharSet charset; /* Initialize ct_list. */ num = sizeof(default_ct_data) / sizeof(CTDataRec); for (ct_data = default_ct_data; num > 0; ct_data++, num--) { charset = _XlcAddCT(ct_data->name, ct_data->ct_sequence); if (charset == NULL) continue; if (strncmp(charset->ct_sequence, "\x1b\x25\x2f", 3) != 0) charset->source = CSsrcStd; else charset->source = CSsrcXLC; } /* Register CompoundText and CharSet converters. */ _XlcSetConverter((XLCd) NULL, XlcNCompoundText, (XLCd) NULL, XlcNCharSet, open_cttocs); _XlcSetConverter((XLCd) NULL, XlcNString, (XLCd) NULL, XlcNCharSet, open_strtocs); _XlcSetConverter((XLCd) NULL, XlcNCharSet, (XLCd) NULL, XlcNCompoundText, open_cstoct); _XlcSetConverter((XLCd) NULL, XlcNCharSet, (XLCd) NULL, XlcNString, open_cstostr); } return True; }