ICU 59.1  59.1
rbbi.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation *
6 * and others. All rights reserved. *
7 ***************************************************************************
8 
9 **********************************************************************
10 * Date Name Description
11 * 10/22/99 alan Creation.
12 * 11/11/99 rgillam Complete port from Java.
13 **********************************************************************
14 */
15 
16 #ifndef RBBI_H
17 #define RBBI_H
18 
19 #include "unicode/utypes.h"
20 
26 #if !UCONFIG_NO_BREAK_ITERATION
27 
28 #include "unicode/brkiter.h"
29 #include "unicode/udata.h"
30 #include "unicode/parseerr.h"
31 #include "unicode/schriter.h"
32 #include "unicode/uchriter.h"
33 
34 
35 struct UTrie;
36 
38 
40 struct RBBIDataHeader;
41 class RuleBasedBreakIteratorTables;
42 class BreakIterator;
43 class RBBIDataWrapper;
44 class UStack;
45 class LanguageBreakEngine;
46 class UnhandledEngine;
47 struct RBBIStateTable;
48 
49 
50 
51 
64 
65 private:
70  UText *fText;
71 
77  CharacterIterator *fCharIter;
78 
84  StringCharacterIterator *fSCharIter;
85 
91  UCharCharacterIterator *fDCharIter;
92 
97  RBBIDataWrapper *fData;
98 
102  int32_t fLastRuleStatusIndex;
103 
110  UBool fLastStatusIndexValid;
111 
117  uint32_t fDictionaryCharCount;
118 
126  int32_t* fCachedBreakPositions;
127 
132  int32_t fNumCachedBreakPositions;
133 
139  int32_t fPositionInCache;
140 
148  UStack *fLanguageBreakEngines;
149 
157  UnhandledEngine *fUnhandledBreakEngine;
158 
164  int32_t fBreakType;
165 
166  //=======================================================================
167  // constructors
168  //=======================================================================
169 
180  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
181 
182 
183  friend class RBBIRuleBuilder;
185  friend class BreakIterator;
186 
187 
188 
189 public:
190 
196 
204 
214  UParseError &parseError,
215  UErrorCode &status);
216 
240  RuleBasedBreakIterator(const uint8_t *compiledRules,
241  uint32_t ruleLength,
242  UErrorCode &status);
243 
257 
262  virtual ~RuleBasedBreakIterator();
263 
271  RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
272 
281  virtual UBool operator==(const BreakIterator& that) const;  // Overrides BreakIterator::operator==, which returns true if another object is semantically equal to this one.
282 
290  UBool operator!=(const BreakIterator& that) const;  // Not-equal operator (non-virtual); its inline definition near the end of this header returns !operator==(that).
291 
302  virtual BreakIterator* clone() const;  // Overrides BreakIterator::clone, which returns a polymorphic copy of this object.
303 
309  virtual int32_t hashCode(void) const;
310 
316  virtual const UnicodeString& getRules(void) const;
317 
318  //=======================================================================
319  // BreakIterator overrides
320  //=======================================================================
321 
347  virtual CharacterIterator& getText(void) const;  // Overrides BreakIterator::getText, which returns a CharacterIterator over the text being analyzed.
348 
349 
364  virtual UText *getUText(UText *fillIn, UErrorCode &status) const;  // Overrides BreakIterator::getUText, which gets a UText for the text being analyzed.
365 
373  virtual void adoptText(CharacterIterator* newText);  // Overrides BreakIterator::adoptText, which changes the text over which this operates. NOTE(review): "adopt" suggests ownership of newText passes to the iterator - confirm.
374 
386  virtual void setText(const UnicodeString& newText);  // Overrides BreakIterator::setText(const UnicodeString&), which changes the text over which this operates.
387 
401  virtual void setText(UText *text, UErrorCode &status);
402 
408  virtual int32_t first(void);  // Overrides BreakIterator::first, which sets the current iteration position to the beginning of the text, position zero.
409 
415  virtual int32_t last(void);  // Overrides BreakIterator::last, which sets the iterator position to the index immediately beyond the last character in the text.
416 
427  virtual int32_t next(int32_t n);
428 
434  virtual int32_t next(void);  // Overrides BreakIterator::next(void), which advances the iterator to the boundary following the current boundary.
435 
441  virtual int32_t previous(void);  // Overrides BreakIterator::previous, which sets the iterator position to the boundary preceding the current boundary.
442 
450  virtual int32_t following(int32_t offset);  // Overrides BreakIterator::following, which advances the iterator to the first boundary following the specified offset.
451 
459  virtual int32_t preceding(int32_t offset);  // Overrides BreakIterator::preceding, which sets the iterator position to the first boundary preceding the specified offset.
460 
469  virtual UBool isBoundary(int32_t offset);  // Overrides BreakIterator::isBoundary, which returns true if the specified position is a boundary position.
470 
476  virtual int32_t current(void) const;  // Overrides BreakIterator::current, which returns the character index of the current iterator position within the text.
477 
478 
511  virtual int32_t getRuleStatus() const;  // Returns the status tag from the break rule that determined the most recent boundary (summary is truncated in this listing; see the ICU BreakIterator::getRuleStatus docs for the full contract).
512 
536  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);  // Gets the status (tag) values from the break rule(s) that determined the most recent boundary. NOTE(review): presumably written into fillInVec, bounded by capacity - confirm against the ICU API docs.
537 
549  virtual UClassID getDynamicClassID(void) const;  // Overrides BreakIterator::getDynamicClassID, which returns a polymorphic class ID for this object (UClassID identifies classes without using the compiler's RTTI).
550 
562  static UClassID U_EXPORT2 getStaticClassID(void);
563 
590  virtual BreakIterator * createBufferClone(void *stackBuffer,
591  int32_t &BufferSize,
592  UErrorCode &status);  // Overrides BreakIterator::createBufferClone, which the base class documents as deprecated functionality.
593 
594 
612  virtual const uint8_t *getBinaryRules(uint32_t &length);
613 
639  virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);  // Overrides BreakIterator::refreshInputText (sets the subject text the iterator operates on without changing other aspects of its state); this override narrows the return type from BreakIterator& to RuleBasedBreakIterator&.
640 
641 
642 private:
643  //=======================================================================
644  // implementation
645  //=======================================================================
651  void reset(void);
652 
657  void setBreakType(int32_t type);
658 
663  void init();
664 
674  int32_t handlePrevious(const RBBIStateTable *statetable);
675 
685  int32_t handleNext(const RBBIStateTable *statetable);
686 
687 
702  int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
703 
704 
711  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
712 
716  void makeRuleStatusValid();
717 
718 };
719 
720 //------------------------------------------------------------------------------
721 //
722 // Inline Functions Definitions ...
723 //
724 //------------------------------------------------------------------------------
725 
727  return !operator==(that);
728 }
729 
731 
732 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
733 
734 #endif
C++ API: Break Iterator.
virtual int32_t next(void)=0
Advance the iterator to the boundary following the current boundary.
virtual UBool isBoundary(int32_t offset)=0
Return true if the specified position is a boundary position.
virtual void adoptText(CharacterIterator *it)=0
Change the text over which this operates.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:93
virtual CharacterIterator & getText(void) const =0
Return a CharacterIterator over the text being analyzed.
virtual UText * getUText(UText *fillIn, UErrorCode &status) const =0
Get a UText for the text being analyzed.
virtual int32_t first(void)=0
Sets the current iteration position to the beginning of the text, position zero.
virtual int32_t following(int32_t offset)=0
Advance the iterator to the first boundary following the specified offset.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:131
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:358
C++ API: String Character Iterator.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: uchriter.h:35
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: schriter.h:45
The BreakIterator class implements methods for finding the location of boundaries in text...
Definition: brkiter.h:102
virtual int32_t last(void)=0
Set the iterator position to the index immediately BEYOND the last character in the text being scanne...
virtual int32_t current(void) const =0
Return character index of the current iterator position within the text.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:396
UBool operator!=(const BreakIterator &that) const
Not-equal operator.
Definition: rbbi.h:726
virtual UBool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status)
For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) that determined the m...
C API: Data loading interface.
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:158
virtual int32_t previous(void)=0
Set the iterator position to the boundary preceding the current boundary.
virtual UClassID getDynamicClassID(void) const =0
Return a polymorphic class ID for this object.
virtual void setText(const UnicodeString &text)=0
Change the text over which this operates.
virtual BreakIterator & refreshInputText(UText *input, UErrorCode &status)=0
Set the subject text string upon which the break iterator is operating without changing any other asp...
virtual BreakIterator * clone(void) const =0
Return a polymorphic copy of this object.
C++ API: char16_t Character Iterator.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:132
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:396
virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status)=0
Deprecated functionality.
virtual int32_t getRuleStatus() const
For RuleBasedBreakIterators, return the status tag from the break rule that determined the most recen...
UText struct.
Definition: utext.h:1345
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:63
virtual int32_t preceding(int32_t offset)=0
Set the iterator position to the first boundary preceding the specified offset.
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
Basic definitions for ICU, for both C and C++ APIs.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:359
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:296
UBool operator!=(const BreakIterator &rhs) const
Returns the complement of the result of operator==.
Definition: brkiter.h:131
int8_t UBool
The ICU boolean type.
Definition: umachine.h:236