From 6456295d747373150577e02588770a1a4ba1efaf Mon Sep 17 00:00:00 2001 From: Ken Hornstein Date: Sat, 26 May 2012 01:55:08 -0400 Subject: [PATCH] Support for handling multibyte encodings in cpstripped(), which means that multibyte character encodings should work correctly on all systems which support the POSIX wide character functions. --- Makefile.am | 3 +- configure.ac | 14 +++++--- sbr/fmt_scan.c | 79 +++++++++++++++++++++++++++-------------- test/common.sh.in | 2 ++ test/scan/test-scan-multibyte | 55 ++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 32 deletions(-) create mode 100755 test/scan/test-scan-multibyte diff --git a/Makefile.am b/Makefile.am index ad6b1e6..2c4dcfe 100644 --- a/Makefile.am +++ b/Makefile.am @@ -37,6 +37,7 @@ TESTS_ENVIRONMENT = MH_OBJ_DIR=@abs_builddir@ \ MH_TEST_DIR=@abs_builddir@/test/testdir \ auxexecdir=$(auxexecdir) bindir=$(bindir) \ mandir=$(mandir) sysconfdir=$(sysconfdir) \ + MULTIBYTE_ENABLED=$(MULTIBYTE_ENABLED) \ $(TESTS_SHELL) ## Keep at end of TESTS_ENVIRONMENT. 
## ## Important note: the "cleanup" test should always be last @@ -66,7 +67,7 @@ TESTS = test/bad-input/test-header \ test/refile/test-refile \ test/repl/test-if-str test/repl/test-trailing-newline \ test/repl/test-multicomp \ - test/scan/test-scan \ + test/scan/test-scan test/scan/test-scan-multibyte \ test/sequences/test-flist test/sequences/test-mark \ test/whatnow/test-attach-detach test/whatnow/test-cd \ test/whatnow/test-ls test/whom/test-whom \ diff --git a/configure.ac b/configure.ac index ee79a0a..daec29c 100644 --- a/configure.ac +++ b/configure.ac @@ -441,11 +441,15 @@ dnl --------------- AC_CHECK_FUNCS([wcwidth mbtowc writev lstat nl_langinfo getutxent]) dnl Check for multibyte character set support -if test "x$ac_cv_header_wchar_h" = "xyes" -a "x$ac_cv_header_wctype_h" = "xyes" \ - -a "x$ac_cv_func_wcwidth" = "xyes" -a "x$ac_cv_func_mbtowc" = "xyes"; then - AC_DEFINE(MULTIBYTE_SUPPORT, 1, - [Define to enable support for multibyte character sets.]) -fi +AS_IF([test "x$ac_cv_header_wchar_h" = "xyes" -a \ + "x$ac_cv_header_wctype_h" = "xyes" -a \ + "x$ac_cv_func_wcwidth" = "xyes" -a \ + "x$ac_cv_func_mbtowc" = "xyes"], + [AC_DEFINE([MULTIBYTE_SUPPORT], [1], + [Define to enable support for multibyte character sets.]) + MULTIBYTE_ENABLED=1], + [MULTIBYTE_ENABLED=0]) +AC_SUBST([MULTIBYTE_ENABLED]) dnl ------------------- dnl CHECK FOR LIBRARIES diff --git a/sbr/fmt_scan.c b/sbr/fmt_scan.c index 3f004e5..74cfd92 100644 --- a/sbr/fmt_scan.c +++ b/sbr/fmt_scan.c @@ -208,37 +208,64 @@ cptrimmed(char **dest, char *str, unsigned int wid, char fill, size_t n) { } static void -cpstripped (char **start, char *end, char *str) +cpstripped (char **dest, char *end, char *str) { - int c; - char *s = str; + int prevCtrl = 1; /* This is 1 so we strip out leading spaces */ + int len; +#ifdef MULTIBYTE_SUPPORT + int char_len; + wchar_t wide_char; +#endif /* MULTIBYTE_SUPPORT */ - if (!s) + if (!str) return; - /* skip any initial control characters or spaces */ - while ((c = 
(unsigned char) *s) && -#ifdef LOCALE - (iscntrl(c) || isspace(c))) -#else - (c <= 32)) -#endif - s++; - - /* compact repeated control characters and spaces into a single space */ - while((c = (unsigned char) *s++) && *start < end) - if (!iscntrl(c) && !isspace(c)) - *(*start)++ = c; - else { - while ((c = (unsigned char) *s) && -#ifdef LOCALE - (iscntrl(c) || isspace(c))) -#else - (c <= 32)) -#endif - s++; - *(*start)++ = ' '; + len = strlen(str); + +#ifdef MULTIBYTE_SUPPORT + mbtowc(NULL, NULL, 0); /* Reset shift state */ +#endif /* MULTIBYTE_SUPPORT */ + + /* + * Process each character at a time; if we have multibyte support + * then deal with that here. + */ + + while (*str != '\0' && len > 0 && *dest < end) { +#ifdef MULTIBYTE_SUPPORT + char_len = mbtowc(&wide_char, str, len); + + if (char_len <= 0 || *dest + char_len > end) + break; + + len -= char_len; + + if (iswcntrl(wide_char) || iswspace(wide_char)) { + str += char_len; +#else /* MULTIBYTE_SUPPORT */ + int c = *str; + len--; + if (iscntrl(c) || isspace(c)) { + str++; +#endif /* MULTIBYTE_SUPPORT */ + if (! 
prevCtrl) { + *(*dest)++ = ' '; + } + + prevCtrl = 1; + continue; } + + prevCtrl = 0; + +#ifdef MULTIBYTE_SUPPORT + memcpy(*dest, str, char_len); + str += char_len; + *dest += char_len; +#else /* MULTIBYTE_SUPPORT */ + *(*dest)++ = *str++; +#endif /* MULTIBYTE_SUPPORT */ + } } static char *lmonth[] = { "January", "February","March", "April", diff --git a/test/common.sh.in b/test/common.sh.in index b1fd484..705e993 100644 --- a/test/common.sh.in +++ b/test/common.sh.in @@ -14,7 +14,9 @@ test -z "$bindir" && bindir="@bindir@" test -z "$mandir" && mandir="@mandir@" test -z "$sysconfdir" && sysconfdir="@sysconfdir@" test -z "$pagerpath" && pagerpath="@pagerpath@" +test -z "$MULTIBYTE_ENABLED" && MULTIBYTE_ENABLED="@MULTIBYTE_ENABLED@" export MH_TEST_DIR auxexecdir bindir mandir sysconfdir pagerpath +export MULTIBYTE_ENABLED test -z "$MH_INST_DIR" && MH_INST_DIR=${MH_TEST_DIR}/inst export MH_INST_DIR diff --git a/test/scan/test-scan-multibyte b/test/scan/test-scan-multibyte new file mode 100755 index 0000000..bd0da47 --- /dev/null +++ b/test/scan/test-scan-multibyte @@ -0,0 +1,55 @@ +#!/bin/sh +############################################################ +# +# Test scan to see if multibyte support (UTF-8 locale) works +# +# Other tests will get the normal ASCII case, so all we care +# about here is UTF-8 encoded headers (RFC 2047). +# +# Note that this file should be edited via a UTF-8 aware +# editor, since UTF-8 characters are in it. +# +############################################################ + +set -e + +if test -z "${MH_OBJ_DIR}"; then + srcdir=`dirname "$0"`/../.. + MH_OBJ_DIR=`cd "$srcdir" && pwd`; export MH_OBJ_DIR +fi + +. "$MH_OBJ_DIR/test/common.sh" + +setup_test + +if test "${MULTIBYTE_ENABLED}" -ne 1; then + test_skip "configure did not detect multibyte support" +fi + +export LC_ALL=en_US.UTF-8 + +# +# Create a test message with RFC 2047 headers we can scan +# + +cat > "${MH_TEST_DIR}/Mail/inbox/11" < +To: Sir Denis =?utf-8?q?Eton=E2=80=93Hogg? 
+Date: Friday, 2 Mar 1984 00:00:00 +Subject: =?utf-8?q?Spin=CC=88al_Tap_=E2=86=92_Tap_into_America!?= + +Things are looking great! +EOF + +expected=$MH_TEST_DIR/$$.expected +actual=$MH_TEST_DIR/$$.actual + +cat > $expected < $actual || exit 1 + +check "$expected" "$actual" + +exit $failed -- 1.7.10.4