ICU 59.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if !UCONFIG_NO_NORMALIZATION
30 
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/unorm2.h"
34 
35 U_NAMESPACE_BEGIN
36 
80 class U_COMMON_API Normalizer2 : public UObject {
81 public:
86  ~Normalizer2();
87 
99  static const Normalizer2 *
100  getNFCInstance(UErrorCode &errorCode);
101 
113  static const Normalizer2 *
114  getNFDInstance(UErrorCode &errorCode);
115 
127  static const Normalizer2 *
128  getNFKCInstance(UErrorCode &errorCode);
129 
141  static const Normalizer2 *
142  getNFKDInstance(UErrorCode &errorCode);
143 
155  static const Normalizer2 *
156  getNFKCCasefoldInstance(UErrorCode &errorCode);
157 
179  static const Normalizer2 *
180  getInstance(const char *packageName,
181  const char *name,
182  UNormalization2Mode mode,
183  UErrorCode &errorCode);
184 
195  UnicodeString
196  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
197  UnicodeString result;
198  normalize(src, result, errorCode);
199  return result;
200  }
214  virtual UnicodeString &
215  normalize(const UnicodeString &src,
216  UnicodeString &dest,
217  UErrorCode &errorCode) const = 0;
232  virtual UnicodeString &
233  normalizeSecondAndAppend(UnicodeString &first,
234  const UnicodeString &second,
235  UErrorCode &errorCode) const = 0;
250  virtual UnicodeString &
251  append(UnicodeString &first,
252  const UnicodeString &second,
253  UErrorCode &errorCode) const = 0;
254 
268  virtual UBool
269  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
270 
295  virtual UBool
296  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
297 
313  virtual UChar32
314  composePair(UChar32 a, UChar32 b) const;
315 
324  virtual uint8_t
325  getCombiningClass(UChar32 c) const;
326 
341  virtual UBool
342  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
343 
359  virtual UNormalizationCheckResult
360  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
361 
384  virtual int32_t
385  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
386 
400  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
401 
416  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
417 
431  virtual UBool isInert(UChar32 c) const = 0;
432 };
433 
445 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
446 public:
457  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
458  norm2(n2), set(filterSet) {}
459 
465 
479  virtual UnicodeString &
480  normalize(const UnicodeString &src,
481  UnicodeString &dest,
482  UErrorCode &errorCode) const;
497  virtual UnicodeString &
498  normalizeSecondAndAppend(UnicodeString &first,
499  const UnicodeString &second,
500  UErrorCode &errorCode) const;
515  virtual UnicodeString &
516  append(UnicodeString &first,
517  const UnicodeString &second,
518  UErrorCode &errorCode) const;
519 
531  virtual UBool
532  getDecomposition(UChar32 c, UnicodeString &decomposition) const;
533 
545  virtual UBool
546  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
547 
558  virtual UChar32
559  composePair(UChar32 a, UChar32 b) const;
560 
569  virtual uint8_t
570  getCombiningClass(UChar32 c) const;
571 
583  virtual UBool
584  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
596  virtual UNormalizationCheckResult
597  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
609  virtual int32_t
610  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
611 
620  virtual UBool hasBoundaryBefore(UChar32 c) const;
621 
630  virtual UBool hasBoundaryAfter(UChar32 c) const;
631 
639  virtual UBool isInert(UChar32 c) const;
640 private:
641  UnicodeString &
642  normalize(const UnicodeString &src,
643  UnicodeString &dest,
644  USetSpanCondition spanCondition,
645  UErrorCode &errorCode) const;
646 
647  UnicodeString &
648  normalizeSecondAndAppend(UnicodeString &first,
649  const UnicodeString &second,
650  UBool doNormalize,
651  UErrorCode &errorCode) const;
652 
653  const Normalizer2 &norm2;
654  const UnicodeSet &set;
655 };
656 
657 U_NAMESPACE_END
658 
659 #endif // !UCONFIG_NO_NORMALIZATION
660 #endif // __NORMALIZER2_H__
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context...
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
C++ API: Unicode String.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:196
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first st...
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:131
Unicode normalization functionality for standard Unicode normalization or for using custom mapping ta...
Definition: normalizer2.h:80
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:396
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one. ...
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:457
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:278
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:152
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:132
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:44
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) a...
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:396
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:359
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:296
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:445
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:93
int8_t UBool
The ICU boolean type.
Definition: umachine.h:236
C++ API: Unicode Set.