From 8add15b86e7aaef41397ab8fa9e77ee7957eb607 Mon Sep 17 00:00:00 2001 From: Tim Serong Date: Sat, 2 May 2015 01:59:53 +1000 Subject: [PATCH] json_spirit: use utf8 internally when parsing \uHHHH When the python CLI is given non-ASCII characters, it converts them to \uHHHH escapes in JSON. json_spirit parses these internally into 16 bit characters, which could only work if json_spirit were built to use std::wstring, which it isn't; it's using std::string, so the high byte ends up being zero'd, leaving the low byte which is effectively garbage. This hack^H^H^H^H change makes json_spirit convert to utf8 internally instead, which can be stored just fine inside a std::string. Note that this implementation still assumes \uHHHH escapes are four hex digits, so it'll only cope with characters in the Basic Multilingual Plane. Still, that's rather a lot more characters than it could cope with before ;) (For characters outside the BMP, Python seems to generate escapes in the form \uHHHHHHHH, i.e. 8 hex digits, which the current implementation doesn't expect to see) Fixes: #7387 Signed-off-by: Tim Serong --- src/json_spirit/json_spirit_reader_template.h | 34 ++++++++++++++----- src/test/mon/osd-pool-create.sh | 15 ++++++++ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/json_spirit/json_spirit_reader_template.h b/src/json_spirit/json_spirit_reader_template.h index f87b59331b7..2eaf743efae 100644 --- a/src/json_spirit/json_spirit_reader_template.h +++ b/src/json_spirit/json_spirit_reader_template.h @@ -13,6 +13,8 @@ #include "json_spirit_value.h" #include "json_spirit_error_position.h" +#include "common/utf8.h" + #define BOOST_SPIRIT_THREADSAFE // uncomment for multithreaded use, requires linking to boost.thread #include @@ -71,18 +73,30 @@ namespace json_spirit return ( hex_to_num( c1 ) << 4 ) + hex_to_num( c2 ); } - template< class Char_type, class Iter_type > - Char_type unicode_str_to_char( Iter_type& begin ) + template< class String_type, class Iter_type > 
+ String_type unicode_str_to_utf8( Iter_type& begin ); + + template<> + std::string unicode_str_to_utf8( std::string::const_iterator & begin ) { + typedef typename std::string::value_type Char_type; + const Char_type c1( *( ++begin ) ); const Char_type c2( *( ++begin ) ); const Char_type c3( *( ++begin ) ); const Char_type c4( *( ++begin ) ); - return ( hex_to_num( c1 ) << 12 ) + - ( hex_to_num( c2 ) << 8 ) + - ( hex_to_num( c3 ) << 4 ) + - hex_to_num( c4 ); + unsigned long uc = ( hex_to_num( c1 ) << 12 ) + + ( hex_to_num( c2 ) << 8 ) + + ( hex_to_num( c3 ) << 4 ) + + hex_to_num( c4 ); + + unsigned char buf[7]; // MAX_UTF8_SZ is 6 (see src/common/utf8.c) + int r = encode_utf8(uc, buf); + if (r >= 0) { + return std::string(reinterpret_cast(buf), r); + } + return std::string("_"); } template< class String_type > @@ -116,7 +130,7 @@ namespace json_spirit { if( end - begin >= 5 ) // expecting "uHHHH..." { - s += unicode_str_to_char< Char_type >( begin ); + s += unicode_str_to_utf8< String_type >( begin ); } break; } @@ -178,11 +192,15 @@ namespace json_spirit return get_str_< std::string >( begin, end ); } +// Need this guard else it tries to instantiate unicode_str_to_utf8 with a +// std::wstring, which isn't presently implemented +#if defined( JSON_SPIRIT_WMVALUE_ENABLED ) && !defined( BOOST_NO_STD_WSTRING ) inline std::wstring get_str( std::wstring::const_iterator begin, std::wstring::const_iterator end ) { return get_str_< std::wstring >( begin, end ); } - +#endif + template< class String_type, class Iter_type > String_type get_str( Iter_type begin, Iter_type end ) { diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh index 428bfe06def..8a57856cd61 100755 --- a/src/test/mon/osd-pool-create.sh +++ b/src/test/mon/osd-pool-create.sh @@ -236,6 +236,21 @@ function TEST_no_pool_delete() { ./ceph osd pool delete foo foo --yes-i-really-really-mean-it } +function TEST_utf8_cli() { + local dir=$1 + run_mon $dir a --public-addr $CEPH_MON + # 
Hopefully it's safe to include literal UTF-8 characters to test + # the fix for http://tracker.ceph.com/issues/7387. If it turns out + # to not be OK (when is the default encoding *not* UTF-8?), maybe + # the character '黄' can be replaced with the escape $'\xe9\xbb\x84' + ./ceph osd pool create 黄 1024 2>&1 | \ + grep "pool '黄' created" || return 1 + ./ceph osd lspools 2>&1 | \ + grep "黄" || return 1 + ./ceph -f json-pretty osd dump | \ + python -c "import json; import sys; json.load(sys.stdin)" || return 1 + ./ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it +} main osd-pool-create