src/dictionary/user_dictionary_util.cc - mozc - Git at Google

 // Copyright 2010, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "dictionary/user_dictionary_util.h"

 #include <string.h>
 #include <algorithm>
 #include "base/base.h"
 #include "base/config_file_stream.h"
 #include "base/file_stream.h"
 #include "base/util.h"
 #include "converter/pos.h"

 namespace mozc {

 namespace {
 // Maximum string length in UserDictionaryEntry's field
 const size_t kMaxKeySize            =   300;
 const size_t kMaxValueSize          =   300;
 const size_t kMaxPOSSize            =   300;
 const size_t kMaxCommentSize        =   300;
 const char kInvalidChars[]= "\n\r\t";
 const char kUserDictionaryFile[] = "user://user_dictionary.db";
 }

 // TODO(keni): Write unit tests for this function.
 bool UserDictionaryUtil::IsValidEntry(
     const UserDictionaryStorage::UserDictionaryEntry &entry) {
   if (entry.key().empty()) {
     VLOG(1) << "key is empty";
     return false;
   }
   if (entry.key().find_first_of(kInvalidChars) != string::npos) {
     VLOG(1) << "Invalid character in key.";
     return false;
   }
   if (entry.key().size() > kMaxKeySize) {
     VLOG(1) << "Too long key.";
     return false;
   }
   if (entry.value().find_first_of(kInvalidChars) != string::npos) {
     VLOG(1) << "Invalid character in value.";
     return false;
   }
   if (entry.value().size() > kMaxValueSize) {
     VLOG(1) << "Too long value.";
     return false;
   }
   if (entry.pos().find_first_of(kInvalidChars) != string::npos) {
     VLOG(1) << "Invalid character in POS.";
     return false;
   }
   if (entry.pos().size() > kMaxPOSSize) {
     VLOG(1) << "Too long POS.";
     return false;
   }
   if (entry.comment().find_first_of(kInvalidChars) != string::npos) {
     VLOG(1) << "Invalid character in comment.";
     return false;
   }
   if (entry.comment().size() > kMaxCommentSize) {
     VLOG(1) << "Too long comment.";
     return false;
   }
   if (!UserDictionaryUtil::IsValidReading(entry.key())) {
     VLOG(1) << "Invalid reading";
     return false;
   }
   if (!POS::IsValidPOS(entry.pos())) {
     VLOG(1) << "Invalid POS";
     return false;
   }

   return true;
 }

 namespace {

 #define INRANGE(w, a, b) ((w) >= (a) && (w) <= (b))

 bool InternalValidateNormalizedReading(const string &normalized_reading) {
   const char *begin = normalized_reading.c_str();
   const char *end = begin + normalized_reading.size();
   size_t mblen = 0;
   while (begin < end) {
     const uint16 w = Util::UTF8ToUCS2(begin, end, &mblen);
     if (INRANGE(w, 0x0021, 0x007E) ||  // Basic Latin (Ascii)
         INRANGE(w, 0x3041, 0x3096) ||  // Hiragana
         INRANGE(w, 0x309B, 0x309C) ||  // KATAKANA-HIRAGANA VOICED/SEMI-VOICED
                                        // SOUND MARK
         INRANGE(w, 0x30FB, 0x30FC) ||  // Nakaten, Prolonged sound mark
         INRANGE(w, 0x3001, 0x3002) ||  // Japanese punctuation marks
         INRANGE(w, 0x300C, 0x300F) ||  // Japanese brackets
         INRANGE(w, 0x301C, 0x301C)) {  // Japanese Wavedash
       begin += mblen;
     } else {
       LOG(INFO) << "Invalid character in reading.";
       return false;
     }
   }
   return true;
 }

 #undef INRANGE

 }  // namespace

 bool UserDictionaryUtil::IsValidReading(const string &reading) {
   string normalized;
   NormalizeReading(reading, &normalized);
   return InternalValidateNormalizedReading(normalized);
 }

 void UserDictionaryUtil::NormalizeReading(const string &input, string *output) {
   output->clear();
   string tmp1, tmp2;
   Util::FullWidthAsciiToHalfWidthAscii(input, &tmp1);
   Util::HalfWidthKatakanaToFullWidthKatakana(tmp1, &tmp2);
   Util::KatakanaToHiragana(tmp2, output);
 }

 string UserDictionaryUtil::GetUserDictionaryFileName() {
   return ConfigFileStream::GetFileName(kUserDictionaryFile);
 }

 // static
 bool UserDictionaryUtil::SanitizeEntry(
     UserDictionaryStorage::UserDictionaryEntry *entry) {
   bool modified = false;
   modified |= Sanitize(entry->mutable_key(), kMaxKeySize);
   modified |= Sanitize(entry->mutable_value(), kMaxValueSize);
   modified |= Sanitize(entry->mutable_pos(), kMaxPOSSize);
   modified |= Sanitize(entry->mutable_comment(), kMaxCommentSize);
   return modified;
 }

 // static
 bool UserDictionaryUtil::Sanitize(string *str, size_t max_size) {
   // First part: Remove invalid characters.
   {
     const size_t original_size = str->size();
     string::iterator begin = str->begin();
     string::iterator end = str->end();
     end = remove(begin, end, '\t');
     end = remove(begin, end, '\n');
     end = remove(begin, end, '\r');

     if (end - begin <= max_size) {
       if (end - begin == original_size) {
         return false;
       } else {
         str->erase(end - begin);
         return true;
       }
     }
   }

   // Second part: Truncate long strings.
   {
     const char *begin = str->data();
     const char *p = begin;
     const char *end = begin + str->size();
     while (p < end) {
       const size_t len = Util::OneCharLen(p);
       if ((p + len - begin) > max_size) {
         str->erase(p - begin);
         return true;
       }
       p += len;
     }
     LOG(FATAL) <<
         "There should be a bug in implementation of the function.";
   }

   return true;
 }
 }  // namespace mozc
	// Copyright 2010, Google Inc.
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above
	// copyright notice, this list of conditions and the following disclaimer
	// in the documentation and/or other materials provided with the
	// distribution.
	// * Neither the name of Google Inc. nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	#include "dictionary/user_dictionary_util.h"

	#include <string.h>
	#include <algorithm>
	#include "base/base.h"
	#include "base/config_file_stream.h"
	#include "base/file_stream.h"
	#include "base/util.h"
	#include "converter/pos.h"

	namespace mozc {

	namespace {
	// Maximum string length in UserDictionaryEntry's field
	const size_t kMaxKeySize = 300;
	const size_t kMaxValueSize = 300;
	const size_t kMaxPOSSize = 300;
	const size_t kMaxCommentSize = 300;
	const char kInvalidChars[]= "\n\r\t";
	const char kUserDictionaryFile[] = "user://user_dictionary.db";
	}

	// TODO(keni): Write unit tests for this function.
	bool UserDictionaryUtil::IsValidEntry(
	const UserDictionaryStorage::UserDictionaryEntry &entry) {
	if (entry.key().empty()) {
	VLOG(1) << "key is empty";
	return false;
	}
	if (entry.key().find_first_of(kInvalidChars) != string::npos) {
	VLOG(1) << "Invalid character in key.";
	return false;
	}
	if (entry.key().size() > kMaxKeySize) {
	VLOG(1) << "Too long key.";
	return false;
	}
	if (entry.value().find_first_of(kInvalidChars) != string::npos) {
	VLOG(1) << "Invalid character in value.";
	return false;
	}
	if (entry.value().size() > kMaxValueSize) {
	VLOG(1) << "Too long value.";
	return false;
	}
	if (entry.pos().find_first_of(kInvalidChars) != string::npos) {
	VLOG(1) << "Invalid character in POS.";
	return false;
	}
	if (entry.pos().size() > kMaxPOSSize) {
	VLOG(1) << "Too long POS.";
	return false;
	}
	if (entry.comment().find_first_of(kInvalidChars) != string::npos) {
	VLOG(1) << "Invalid character in comment.";
	return false;
	}
	if (entry.comment().size() > kMaxCommentSize) {
	VLOG(1) << "Too long comment.";
	return false;
	}
	if (!UserDictionaryUtil::IsValidReading(entry.key())) {
	VLOG(1) << "Invalid reading";
	return false;
	}
	if (!POS::IsValidPOS(entry.pos())) {
	VLOG(1) << "Invalid POS";
	return false;
	}

	return true;
	}

	namespace {

	#define INRANGE(w, a, b) ((w) >= (a) && (w) <= (b))

	bool InternalValidateNormalizedReading(const string &normalized_reading) {
	const char *begin = normalized_reading.c_str();
	const char *end = begin + normalized_reading.size();
	size_t mblen = 0;
	while (begin < end) {
	const uint16 w = Util::UTF8ToUCS2(begin, end, &mblen);
	if (INRANGE(w, 0x0021, 0x007E) \|\| // Basic Latin (Ascii)
	INRANGE(w, 0x3041, 0x3096) \|\| // Hiragana
	INRANGE(w, 0x309B, 0x309C) \|\| // KATAKANA-HIRAGANA VOICED/SEMI-VOICED
	// SOUND MARK
	INRANGE(w, 0x30FB, 0x30FC) \|\| // Nakaten, Prolonged sound mark
	INRANGE(w, 0x3001, 0x3002) \|\| // Japanese punctuation marks
	INRANGE(w, 0x300C, 0x300F) \|\| // Japanese brackets
	INRANGE(w, 0x301C, 0x301C)) { // Japanese Wavedash
	begin += mblen;
	} else {
	LOG(INFO) << "Invalid character in reading.";
	return false;
	}
	}
	return true;
	}

	#undef INRANGE

	} // namespace

	bool UserDictionaryUtil::IsValidReading(const string &reading) {
	string normalized;
	NormalizeReading(reading, &normalized);
	return InternalValidateNormalizedReading(normalized);
	}

	void UserDictionaryUtil::NormalizeReading(const string &input, string *output) {
	output->clear();
	string tmp1, tmp2;
	Util::FullWidthAsciiToHalfWidthAscii(input, &tmp1);
	Util::HalfWidthKatakanaToFullWidthKatakana(tmp1, &tmp2);
	Util::KatakanaToHiragana(tmp2, output);
	}

	string UserDictionaryUtil::GetUserDictionaryFileName() {
	return ConfigFileStream::GetFileName(kUserDictionaryFile);
	}

	// static
	bool UserDictionaryUtil::SanitizeEntry(
	UserDictionaryStorage::UserDictionaryEntry *entry) {
	bool modified = false;
	modified \|= Sanitize(entry->mutable_key(), kMaxKeySize);
	modified \|= Sanitize(entry->mutable_value(), kMaxValueSize);
	modified \|= Sanitize(entry->mutable_pos(), kMaxPOSSize);
	modified \|= Sanitize(entry->mutable_comment(), kMaxCommentSize);
	return modified;
	}

	// static
	bool UserDictionaryUtil::Sanitize(string *str, size_t max_size) {
	// First part: Remove invalid characters.
	{
	const size_t original_size = str->size();
	string::iterator begin = str->begin();
	string::iterator end = str->end();
	end = remove(begin, end, '\t');
	end = remove(begin, end, '\n');
	end = remove(begin, end, '\r');

	if (end - begin <= max_size) {
	if (end - begin == original_size) {
	return false;
	} else {
	str->erase(end - begin);
	return true;
	}
	}
	}

	// Second part: Truncate long strings.
	{
	const char *begin = str->data();
	const char *p = begin;
	const char *end = begin + str->size();
	while (p < end) {
	const size_t len = Util::OneCharLen(p);
	if ((p + len - begin) > max_size) {
	str->erase(p - begin);
	return true;
	}
	p += len;
	}
	LOG(FATAL) <<
	"There should be a bug in implementation of the function.";
	}

	return true;
	}
	} // namespace mozc