From: Oliver Kiddle Date: Thu, 27 Jan 2005 16:26:24 +0000 (+0000) Subject: On systems where it is available, use nl_langinfo to get the character X-Git-Tag: RELEASE_1_2~51 X-Git-Url: http://git.marmaro.de/?p=mmh;a=commitdiff_plain;h=adc954633908e3fc97dfa94c08f964e1bbb94086 On systems where it is available, use nl_langinfo to get the character set if MM_CHARSET is unset --- diff --git a/ChangeLog b/ChangeLog index 5594bfc..88e5bd9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2005-01-27 Oliver Kiddle + + * On systems where it is available, use nl_langinfo to get the + character set if MM_CHARSET is unset + 2005-01-21 Oliver Kiddle * sbr/check_charset.c US-ASCII is a subset of UTF-8 so can be diff --git a/configure.in b/configure.in index a5e891c..85e5181 100644 --- a/configure.in +++ b/configure.in @@ -444,8 +444,8 @@ AC_HEADER_SYS_WAIT AC_HEADER_STAT AC_CHECK_HEADERS(string.h memory.h stdlib.h unistd.h errno.h fcntl.h \ limits.h crypt.h termcap.h termio.h termios.h locale.h \ - netdb.h sys/param.h sys/time.h sys/utsname.h arpa/inet.h \ - arpa/ftp.h) + langinfo.h netdb.h sys/param.h sys/time.h sys/utsname.h \ + arpa/inet.h arpa/ftp.h) AC_CACHE_CHECK(POSIX termios, nmh_cv_sys_posix_termios, @@ -499,7 +499,7 @@ AC_FUNC_VFORK AC_CHECK_LIB(mkstemp,mkstemp) AC_CHECK_FUNCS(waitpid wait3 sigaction sigprocmask sigblock sigsetmask \ sighold sigrelse writev lstat uname tzset killpg mkstemp \ - sethostent getutent) + sethostent getutent nl_langinfo) dnl solaris screws this up AC_CHECK_FUNC(gethostbyname, [AC_DEFINE(HAVE_GETHOSTBYNAME)], diff --git a/h/prototypes.h b/h/prototypes.h index fa9f0c2..c2a876f 100644 --- a/h/prototypes.h +++ b/h/prototypes.h @@ -83,6 +83,7 @@ char *m_tmpfil (char *); void m_unknown(FILE *); int makedir (char *); char *nmh_getpass(const char *); +char *norm_charmap(char *); char *new_fs (char *, char *, char *); char *path(char *, int); int peekc(FILE *ib); diff --git a/sbr/Makefile.in b/sbr/Makefile.in index 84b003f..317a177 100644 --- 
a/sbr/Makefile.in +++ b/sbr/Makefile.in @@ -65,7 +65,8 @@ SRCS = add.c addrsbr.c ambigsw.c atooi.c brkstring.c \ fmt_scan.c lock_file.c m_atoi.c m_backup.c \ m_convert.c m_draft.c m_getfld.c m_gmprot.c \ m_maildir.c m_name.c m_scratch.c m_tmpfil.c \ - makedir.c mts.c path.c peekc.c pidwait.c pidstatus.c \ + makedir.c mts.c norm_charmap.c \ + path.c peekc.c pidwait.c pidstatus.c \ print_help.c print_sw.c print_version.c push.c \ putenv.c pwd.c refile.c remdir.c r1bindex.c \ readconfig.c ruserpass.c seq_add.c seq_bits.c \ diff --git a/sbr/check_charset.c b/sbr/check_charset.c index f45448d..d6b8ca3 100644 --- a/sbr/check_charset.c +++ b/sbr/check_charset.c @@ -10,6 +10,25 @@ */ #include <h/mh.h> +#ifdef HAVE_LANGINFO_H +# include <langinfo.h> +#endif + + +/* + * Get the current character set + */ +char * +get_charset () +{ + char *charset = getenv ("MM_CHARSET"); +#if defined(HAVE_NL_LANGINFO) && defined(CODESET) + if (!charset) + charset = norm_charmap(nl_langinfo (CODESET)); +#endif + return charset; +} + /* * Check if we can display a given character set natively. @@ -28,7 +47,7 @@ check_charset (char *str, int len) /* Cache the name of our default character set */ if (!mm_charset) { - if (!(mm_charset = getenv ("MM_CHARSET"))) + if (!(mm_charset = get_charset ())) mm_charset = "US-ASCII"; mm_len = strlen (mm_charset); @@ -63,7 +82,7 @@ write_charset_8bit (void) * Cache the name of the character set to * use for 8bit text. 
*/ - if (!mm_charset && !(mm_charset = getenv ("MM_CHARSET"))) + if (!mm_charset && !(mm_charset = get_charset ())) mm_charset = "x-unknown"; return mm_charset; diff --git a/sbr/norm_charmap.c b/sbr/norm_charmap.c new file mode 100644 index 0000000..ae81046 --- /dev/null +++ b/sbr/norm_charmap.c @@ -0,0 +1,112 @@ +/* + * The Single Unix Specification function nl_langinfo(CODESET) + * returns the name of the encoding used by the currently selected + * locale: + * + * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html + * + * Unfortunately the encoding names are not yet standardized. + * This function knows about the encoding names used on many + * different systems and converts them where possible into + * the corresponding MIME charset name registered in + * + * http://www.iana.org/assignments/character-sets + * + * Please extend it as needed and suggest improvements to the author. + * + * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11 + * Permission to use, copy, modify, and distribute this software + * for any purpose and without fee is hereby granted. The author + * disclaims all warranties with regard to this software. + * + * Latest version: + * + * http://www.cl.cam.ac.uk/~mgk25/ucs/norm_charmap.c + */ + +#include <string.h> + +#define digit(x) ((x) >= '0' && (x) <= '9') + +static char buf[16]; + +char * +norm_charmap(char *name) +{ + char *p; + + if (!name) + return name; + + /* Many need no remapping, but they are listed here so you + * can see what output to expect, and modify for your needs + * as necessary. 
*/ + if (!strcmp(name, "UTF-8")) + return "UTF-8"; + if (!strcmp(name, "EUC-JP")) + return "EUC-JP"; + if (!strcmp(name, "EUC-KR")) + return "EUC-KR"; + if (!strcmp(name, "EUC-TW")) + return "EUC-TW"; + if (!strcmp(name, "KOI8-R")) + return "KOI8-R"; + if (!strcmp(name, "KOI8-U")) + return "KOI8-U"; + if (!strcmp(name, "GBK")) + return "GBK"; + if (!strcmp(name, "GB2312")) + return "GB2312"; + if (!strcmp(name, "GB18030")) + return "GB18030"; + if (!strcmp(name, "VSCII")) + return "VSCII"; + + /* ASCII comes in many names */ + if (!strcmp(name, "ASCII") || + !strcmp(name, "US-ASCII") || + !strcmp(name, "ANSI_X3.4-1968") || + !strcmp(name, "646") || + !strcmp(name, "ISO646") || + !strcmp(name, "ISO_646.IRV")) + return "US-ASCII"; + + /* ISO 8859 will be converted to "ISO-8859-x" */ + if ((p = strstr(name, "8859-"))) { + memcpy(buf, "ISO-8859-\0\0", 12); + p += 5; + if (digit(*p)) { + buf[9] = *p++; + if (digit(*p)) buf[10] = *p++; + return buf; + } + } + + /* Windows code pages will be converted to "WINDOWS-12xx" */ + if ((p = strstr(name, "CP12"))) { + memcpy(buf, "WINDOWS-12\0\0", 13); + p += 4; + if (digit(*p)) { + buf[10] = *p++; + if (digit(*p)) buf[11] = *p++; + return buf; + } + } + + /* TIS-620 comes in at least the following two forms */ + if (!strcmp(name, "TIS-620") || + !strcmp(name, "TIS620.2533")) + return "ISO-8859-11"; + + /* For some, uppercase/lowercase might differ */ + if (!strcmp(name, "Big5") || !strcmp(name, "BIG5")) + return "Big5"; + if (!strcmp(name, "Big5HKSCS") || !strcmp(name, "BIG5HKSCS")) + return "Big5HKSCS"; + + /* I don't know of any implementation of nl_langinfo(CODESET) out + * there that returns anything else (and I'm not even certain all of + * the above occur in the wild), but just in case, as a fallback, + * return the unmodified name. */ + return name; +}