regex.h

Go to the documentation of this file.
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2004, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 // #define REGEX_DEBUG 00020 00040 #include "unicode/utypes.h" 00041 00042 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00043 00044 #include "unicode/uobject.h" 00045 #include "unicode/unistr.h" 00046 #include "unicode/parseerr.h" 00047 00048 #include "unicode/uregex.h" 00049 00050 U_NAMESPACE_BEGIN 00051 00052 00053 // Forward Declarations... 00054 00055 class RegexMatcher; 00056 class RegexPattern; 00057 class UVector; 00058 class UVector32; 00059 class UnicodeSet; 00060 struct REStackFrame; 00061 struct Regex8BitSet; 00062 class RuleBasedBreakIterator; 00063 class RegexCImpl; 00064 00065 00066 00067 00072 #ifdef REGEX_DEBUG 00073 U_INTERNAL void U_EXPORT2 00074 RegexPatternDump(const RegexPattern *pat); 00075 #else 00076 #define RegexPatternDump(pat) 00077 #endif 00078 00079 00080 00092 class U_I18N_API RegexPattern: public UObject { 00093 public: 00094 00102 RegexPattern(); 00103 00109 RegexPattern(const RegexPattern &source); 00110 00116 virtual ~RegexPattern(); 00117 00126 UBool operator==(const RegexPattern& that) const; 00127 00136 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}; 00137 00143 RegexPattern &operator =(const RegexPattern &source); 00144 00152 virtual RegexPattern *clone() const; 00153 00154 00175 static RegexPattern *compile( const UnicodeString &regex, 00176 UParseError &pe, 00177 UErrorCode &status); 00178 00199 static RegexPattern *compile( const UnicodeString &regex, 00200 uint32_t flags, 00201 UParseError &pe, 00202 UErrorCode &status); 00203 00204 00223 static RegexPattern *compile( const UnicodeString &regex, 00224 uint32_t flags, 00225 UErrorCode &status); 00226 00227 00233 virtual uint32_t flags() const; 00234 00247 virtual RegexMatcher *matcher(const UnicodeString &input, 00248 UErrorCode &status) const; 00249 00250 00262 virtual RegexMatcher *matcher(UErrorCode &status) const; 00263 00264 00279 static UBool matches(const UnicodeString &regex, 00280 const UnicodeString &input, 00281 UParseError &pe, 00282 UErrorCode &status); 00283 00284 00289 virtual UnicodeString pattern() const; 00290 00291 00317 virtual int32_t split(const UnicodeString &input, 00318 UnicodeString dest[], 00319 int32_t destCapacity, 00320 UErrorCode &status) const; 00321 00322 00328 virtual UClassID getDynamicClassID() const; 00329 00335 static UClassID getStaticClassID(); 00336 00337 private: 00338 // 00339 // Implementation Data 00340 // 00341 UnicodeString fPattern; // The original pattern string. 00342 uint32_t fFlags; // The flags used when compiling the pattern. 00343 // 00344 UVector32 *fCompiledPat; // The compiled pattern p-code. 00345 UnicodeString fLiteralText; // Any literal string data from the pattern, 00346 // after un-escaping, for use during the match. 00347 00348 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00349 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00350 00351 00352 UErrorCode fDeferredStatus; // status if some prior error has left this 00353 // RegexPattern in an unusable state. 00354 00355 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00356 // >= this value. For some patterns, this calculated 00357 // value may be less than the true shortest 00358 // possible match. 00359 00360 int32_t fFrameSize; // Size of a state stack frame in the 00361 // execution engine. 00362 00363 int32_t fDataSize; // The size of the data needed by the pattern that 00364 // does not go on the state stack, but has just 00365 // a single copy per matcher. 00366 00367 UVector32 *fGroupMap; // Map from capture group number to position of 00368 // the group's variables in the matcher stack frame. 00369 00370 int32_t fMaxCaptureDigits; 00371 00372 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00373 // regex character classes, e.g. Word. 00374 00375 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00376 // sets for predefined regex classes. 00377 00378 int32_t fStartType; // Info on how a match must start. 00379 int32_t fInitialStringIdx; // 00380 int32_t fInitialStringLen; 00381 UnicodeSet *fInitialChars; 00382 UChar32 fInitialChar; 00383 Regex8BitSet *fInitialChars8; 00384 00385 friend class RegexCompile; 00386 friend class RegexMatcher; 00387 friend class RegexCImpl; 00388 00389 // 00390 // Implementation Methods 00391 // 00392 void init(); // Common initialization, for use by constructors. 00393 void zap(); // Common cleanup 00394 #ifdef REGEX_DEBUG 00395 void dumpOp(int32_t index) const; 00396 friend void RegexPatternDump(const RegexPattern *); 00397 #endif 00398 00399 }; 00400 00401 00402 00412 class U_I18N_API RegexMatcher: public UObject { 00413 public: 00414 00429 RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status); 00430 00446 RegexMatcher(const UnicodeString &regexp, const UnicodeString &input, 00447 uint32_t flags, UErrorCode &status); 00448 00449 00455 virtual ~RegexMatcher(); 00456 00457 00464 virtual UBool matches(UErrorCode &status); 00465 00474 virtual UBool matches(int32_t startIndex, UErrorCode &status); 00475 00476 00477 00478 00491 virtual UBool lookingAt(UErrorCode &status); 00492 00493 00507 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status); 00508 00521 virtual UBool find(); 00522 00523 00533 virtual UBool find(int32_t start, UErrorCode &status); 00534 00535 00545 virtual UnicodeString group(UErrorCode &status) const; 00546 00547 00560 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00561 00562 00568 virtual int32_t groupCount() const; 00569 00570 00578 virtual int32_t start(UErrorCode &status) const; 00579 00580 00594 virtual int32_t start(int group, UErrorCode &status) const; 00595 00596 00606 virtual int32_t end(UErrorCode &status) const; 00607 00608 00622 virtual int32_t end(int group, UErrorCode &status) const; 00623 00624 00633 virtual RegexMatcher &reset(); 00634 00635 00645 virtual RegexMatcher &reset(int32_t index, UErrorCode &status); 00646 00647 00655 virtual RegexMatcher &reset(const UnicodeString &input); 00656 00657 00664 virtual const UnicodeString &input() const; 00665 00666 00672 virtual const RegexPattern &pattern() const; 00673 00674 00691 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 00692 00693 00714 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 00715 00743 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 00744 const UnicodeString &replacement, UErrorCode &status); 00745 00746 00757 virtual UnicodeString &appendTail(UnicodeString &dest); 00758 00759 00760 00785 virtual int32_t split(const UnicodeString &input, 00786 UnicodeString dest[], 00787 int32_t destCapacity, 00788 UErrorCode &status); 00789 00790 00791 00797 void setTrace(UBool state); 00798 00799 00805 static UClassID getStaticClassID(); 00806 00812 virtual UClassID getDynamicClassID() const; 00813 00814 private: 00815 // Constructors and other object boilerplate are private. 00816 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 00817 RegexMatcher(); // default constructor not implemented 00818 RegexMatcher(const RegexPattern *pat); 00819 RegexMatcher(const RegexMatcher &other); 00820 RegexMatcher &operator =(const RegexMatcher &rhs); 00821 friend class RegexPattern; 00822 friend class RegexCImpl; 00823 00824 00825 // 00826 // MatchAt This is the internal interface to the match engine itself. 00827 // Match status comes back in matcher member variables. 00828 // 00829 void MatchAt(int32_t startIdx, UErrorCode &status); 00830 inline void backTrack(int32_t &inputIdx, int32_t &patIdx); 00831 UBool isWordBoundary(int32_t pos); // perform Perl-like \b test 00832 UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test 00833 REStackFrame *resetStack(); 00834 inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, 00835 int32_t frameSize, UErrorCode &status); 00836 00837 00838 const RegexPattern *fPattern; 00839 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 00840 // should delete it when through. 00841 const UnicodeString *fInput; 00842 00843 UBool fMatch; // True if the last match was successful. 00844 int32_t fMatchStart; // Position of the start of the most recent match 00845 int32_t fMatchEnd; // First position after the end of the most recent match 00846 int32_t fLastMatchEnd; // First position after the end of the previous match. 00847 00848 UVector32 *fStack; 00849 REStackFrame *fFrame; // After finding a match, the last active stack 00850 // frame, which will contain the capture group results. 00851 // NOT valid while match engine is running. 00852 00853 int32_t *fData; // Data area for use by the compiled pattern. 00854 int32_t fSmallData[8]; // Use this for data if it's enough. 00855 00856 UBool fTraceDebug; // Set true for debug tracing of match engine. 00857 00858 UErrorCode fDeferredStatus; // Save error state if that cannot be immediately 00859 // reported, or that permanently disables this matcher. 00860 00861 RuleBasedBreakIterator *fWordBreakItr; 00862 00863 00864 }; 00865 00866 U_NAMESPACE_END 00867 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 00868 #endif

Generated on Fri Jun 18 12:35:58 2004 for ICU by doxygen 1.3.7