git.marmaro.de Git - mmh/blob - sbr/norm_charmap.c

   1 /*
   2  * The Single Unix Specification function nl_langinfo(CODESET)
   3  * returns the name of the encoding used by the currently selected
   4  * locale:
   5  *
   6  *   http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html
   7  *
   8  * Unfortunately the encoding names are not yet standardized.
   9  * This function knows about the encoding names used on many
  10  * different systems and converts them where possible into
  11  * the corresponding MIME charset name registered in
  12  *
  13  *   http://www.iana.org/assignments/character-sets
  14  *
  15  * Please extend it as needed and suggest improvements to the author.
  16  *
  17  * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11
  18  * Permission to use, copy, modify, and distribute this software
  19  * for any purpose and without fee is hereby granted. The author
  20  * disclaims all warranties with regard to this software.
  21  *
  22  * Latest version:
  23  *
  24  *   http://www.cl.cam.ac.uk/~mgk25/ucs/norm_charmap.c
  25  */
  26
  27 #include <string.h>
  28
  29 #define digit(x) ((x) >= '0' && (x) <= '9')
  30
  31 static char buf[16];
  32
  33 char *
  34 norm_charmap(char *name)
  35 {
  36   char *p;
  37
  38   if (!name)
  39     return name;
  40
  41   /* Many need no remapping, but they are listed here so you
  42    * can see what output to expect, and modify for your needs
  43    * as necessary. */
  44   if (!strcmp(name, "UTF-8"))
  45     return "UTF-8";
  46   if (!strcmp(name, "EUC-JP"))
  47     return "EUC-JP";
  48   if (!strcmp(name, "EUC-KR"))
  49     return "EUC-KR";
  50   if (!strcmp(name, "EUC-TW"))
  51     return "EUC-TW";
  52   if (!strcmp(name, "KOI8-R"))
  53     return "KOI8-R";
  54   if (!strcmp(name, "KOI8-U"))
  55     return "KOI8-U";
  56   if (!strcmp(name, "GBK"))
  57     return "GBK";
  58   if (!strcmp(name, "GB2312"))
  59     return "GB2312";
  60   if (!strcmp(name, "GB18030"))
  61     return "GB18030";
  62   if (!strcmp(name, "VSCII"))
  63     return "VSCII";
  64
  65   /* ASCII comes in many names */
  66   if (!strcmp(name, "ASCII") ||
  67       !strcmp(name, "US-ASCII") ||
  68       !strcmp(name, "ANSI_X3.4-1968") ||
  69       !strcmp(name, "646") ||
  70       !strcmp(name, "ISO646") ||
  71       !strcmp(name, "ISO_646.IRV"))
  72     return "US-ASCII";
  73
  74   /* ISO 8859 will be converted to "ISO-8859-x" */
  75   if ((p = strstr(name, "8859-"))) {
  76     memcpy(buf, "ISO-8859-\0\0", 12);
  77     p += 5;
  78     if (digit(*p)) {
  79       buf[9] = *p++;
  80       if (digit(*p)) buf[10] = *p++;
  81       return buf;
  82     }
  83   }
  84
  85   /* Windows code pages will be converted to "WINDOWS-12xx" */
  86   if ((p = strstr(name, "CP12"))) {
  87     memcpy(buf, "WINDOWS-12\0\0", 13);
  88     p += 4;
  89     if (digit(*p)) {
  90       buf[10] = *p++;
  91       if (digit(*p)) buf[11] = *p++;
  92       return buf;
  93     }
  94   }
  95
  96   /* TIS-620 comes in at least the following two forms */
  97   if (!strcmp(name, "TIS-620") ||
  98       !strcmp(name, "TIS620.2533"))
  99     return "ISO-8859-11";
 100
 101   /* For some, uppercase/lowercase might differ */
 102   if (!strcmp(name, "Big5") || !strcmp(name, "BIG5"))
 103     return "Big5";
 104   if (!strcmp(name, "Big5HKSCS") || !strcmp(name, "BIG5HKSCS"))
 105     return "Big5HKSCS";
 106
 107   /* I don't know of any implementation of nl_langinfo(CODESET) out
 108    * there that returns anything else (and I'm not even certain all of
 109    * the above occur in the wild), but just in case, as a fallback,
 110    * return the unmodified name. */
 111   return name;
 112 }