Main Page   Class Hierarchy   Compound List   File List   Header Files   Sources   Compound Members   File Members  

unicode.h

This is the verbatim text of the unicode.h include file.
/*
*****************************************************************************************
*   Copyright (C) 1996-1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
*****************************************************************************************
*/
//  FILE NAME : unicode.h
//
//  CREATED
//      Wednesday, December 11, 1996
//
//  CREATED BY
//      Helena Shih
//
//  CHANGES
//      Thursday, April 15, 1999
//      Modified the definitions of all the functions
//      C++ Wrappers for Unicode
//  CHANGES BY
//      Madhu Katragadda
//   5/20/99     Madhu          Added the function getVersion()
//  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
//********************************************************************************************
   
         

#ifndef UNICODE_H
#define UNICODE_H

#include "unicode/utypes.h"
#include "unicode/uchar.h"

/**
 * Static-only C++ wrapper for character properties.
 *
 * Every member function is a static inline forwarder whose definition follows
 * this class in the same header; each one calls a u_* function or UTF_* macro
 * obtained through the headers included by this file.  The class is never
 * meant to be instantiated or subclassed: its constructors, destructor and
 * assignment operator are protected and only declared in this header.
 */
class U_COMMON_API Unicode
{
public:
    /*
     * In C++, static const members actually take up memory and need to be accessed.
     * enum values are more like C #define's.
     * The following is a collection of constants, not an enumeration type.
     *
     * MIN_VALUE..MAX_VALUE is the range 0..0x10ffff.
     * MAX_CHAR_LENGTH mirrors UTF_MAX_CHAR_LENGTH.
     * MIN_RADIX..MAX_RADIX is the radix range accepted by digit() and forDigit().
     */
    enum {
        MIN_VALUE=0,

        MAX_VALUE=0x10ffff,

        MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,

        MIN_RADIX=2,

        MAX_RADIX=36
    };

    /*
     * General character type constants.
     * NOTE(review): presumably these are the values reported by getType();
     * confirm against u_charType().
     * GENERAL_TYPES_COUNT is one past the last type value.
     */
    enum EUnicodeGeneralTypes
    {
        UNASSIGNED              = 0,
        UPPERCASE_LETTER        = 1,
        LOWERCASE_LETTER        = 2,
        TITLECASE_LETTER        = 3,
        MODIFIER_LETTER         = 4,
        OTHER_LETTER            = 5,
        NON_SPACING_MARK        = 6,
        ENCLOSING_MARK          = 7,
        COMBINING_SPACING_MARK  = 8,
        DECIMAL_DIGIT_NUMBER    = 9,
        LETTER_NUMBER           = 10,
        OTHER_NUMBER            = 11,
        SPACE_SEPARATOR         = 12,
        LINE_SEPARATOR          = 13,
        PARAGRAPH_SEPARATOR     = 14,
        CONTROL                 = 15,
        FORMAT                  = 16,
        PRIVATE_USE             = 17,
        SURROGATE               = 18,
        DASH_PUNCTUATION        = 19,
        START_PUNCTUATION       = 20,
        END_PUNCTUATION         = 21,
        CONNECTOR_PUNCTUATION   = 22,
        OTHER_PUNCTUATION       = 23,
        MATH_SYMBOL             = 24,
        CURRENCY_SYMBOL         = 25,
        MODIFIER_SYMBOL         = 26,
        OTHER_SYMBOL            = 27,
        INITIAL_PUNCTUATION     = 28,
        FINAL_PUNCTUATION       = 29,
        GENERAL_TYPES_COUNT     = 30
    };

    /*
     * Script constants; the return type of getScript().
     * The enumerators carry implicit values, so their order is significant
     * and must not be changed.  kScriptCount is the number of entries before it.
     */
    enum EUnicodeScript
    {
        kBasicLatin,
        kLatin1Supplement,
        kLatinExtendedA,
        kLatinExtendedB,
        kIPAExtension,
        kSpacingModifier,
        kCombiningDiacritical,
        kGreek,
        kCyrillic,
        kArmenian,
        kHebrew,
        kArabic,
        kDevanagari,
        kBengali,
        kGurmukhi,
        kGujarati,
        kOriya,
        kTamil,
        kTelugu,
        kKannada,
        kMalayalam,
        kThai,
        kLao,
        kTibetan,
        kGeorgian,
        kHangulJamo,
        kLatinExtendedAdditional,
        kGreekExtended,
        kGeneralPunctuation,
        kSuperSubScript,
        kCurrencySymbolScript,
        kSymbolCombiningMark,
        kLetterlikeSymbol,
        kNumberForm,
        kArrow,
        kMathOperator,
        kMiscTechnical,
        kControlPicture,
        kOpticalCharacter,
        kEnclosedAlphanumeric,
        kBoxDrawing,
        kBlockElement,
        kGeometricShape,
        kMiscSymbol,
        kDingbat,
        kCJKSymbolPunctuation,
        kHiragana,
        kKatakana,
        kBopomofo,
        kHangulCompatibilityJamo,
        kKanbun,
        kEnclosedCJKLetterMonth,
        kCJKCompatibility,
        kCJKUnifiedIdeograph,
        kHangulSyllable,
        kHighSurrogate,
        kHighPrivateUseSurrogate,
        kLowSurrogate,
        kPrivateUse,
        kCJKCompatibilityIdeograph,
        kAlphabeticPresentation,
        kArabicPresentationA,
        kCombiningHalfMark,
        kCJKCompatibilityForm,
        kSmallFormVariant,
        kArabicPresentationB,
        kNoScript,
        kHalfwidthFullwidthForm,
        kScriptCount
    };

    /*
     * Directionality constants; the return type of characterDirection(),
     * which casts the result of u_charDirection() to this type.
     */
    enum EDirectionProperty {
        LEFT_TO_RIGHT               = 0,
        RIGHT_TO_LEFT               = 1,
        EUROPEAN_NUMBER             = 2,
        EUROPEAN_NUMBER_SEPARATOR   = 3,
        EUROPEAN_NUMBER_TERMINATOR  = 4,
        ARABIC_NUMBER               = 5,
        COMMON_NUMBER_SEPARATOR     = 6,
        BLOCK_SEPARATOR             = 7,
        SEGMENT_SEPARATOR           = 8,
        WHITE_SPACE_NEUTRAL         = 9,
        OTHER_NEUTRAL               = 10,
        LEFT_TO_RIGHT_EMBEDDING     = 11,
        LEFT_TO_RIGHT_OVERRIDE      = 12,
        RIGHT_TO_LEFT_ARABIC        = 13,
        RIGHT_TO_LEFT_EMBEDDING     = 14,
        RIGHT_TO_LEFT_OVERRIDE      = 15,
        POP_DIRECTIONAL_FORMAT      = 16,
        DIR_NON_SPACING_MARK        = 17,
        BOUNDARY_NEUTRAL            = 18
    };

    /*
     * Cell width constants.
     * NOTE(review): presumably the values reported by getCellWidth(), which
     * itself returns a plain uint16_t; confirm against u_charCellWidth().
     */
    enum ECellWidths
    {
        ZERO_WIDTH              = 0,
        HALF_WIDTH              = 1,
        FULL_WIDTH              = 2,
        NEUTRAL                 = 3
    };

    /** Forwards to the UTF_IS_SINGLE macro. */
    static inline UBool isSingle(UChar c);

    /** Forwards to the UTF_IS_LEAD macro. */
    static inline UBool isLead(UChar c);

    /** Forwards to the UTF_IS_TRAIL macro. */
    static inline UBool isTrail(UChar c);

    /** Forwards to the UTF_IS_SURROGATE macro. */
    static inline UBool isSurrogate(UChar32 c);

    /** Forwards to the UTF_IS_UNICODE_CHAR macro. */
    static inline UBool isUnicodeChar(UChar32 c);

    /** Forwards to the UTF_IS_ERROR macro. */
    static inline UBool isError(UChar32 c);

    /** Forwards to the UTF_IS_VALID macro. */
    static inline UBool isValid(UChar32 c);

    /** Forwards to the UTF_NEED_MULTIPLE_UCHAR macro. */
    static inline UBool needMultipleUChar(UChar32 c);

    /** Forwards to the UTF_CHAR_LENGTH macro. */
    static inline int32_t charLength(UChar32 c);

    /** Forwards to the UTF_ARRAY_SIZE macro. */
    static inline int32_t arraySize(int32_t size);

    /** Checks if ch is a lower case letter (u_islower). */
    static inline UBool isLowerCase(UChar32 ch);

    /** Checks if ch is an upper case letter (u_isupper). */
    static inline UBool isUpperCase(UChar32 ch);

    /** Checks if ch is a title case letter (u_istitle). */
    static inline UBool isTitleCase(UChar32 ch);

    /** Checks if ch is a decimal digit (u_isdigit). */
    static inline UBool isDigit(UChar32 ch);

    /** Checks if ch is a Unicode character with an assigned character type (u_isdefined). */
    static inline UBool isDefined(UChar32 ch);

    /** Checks if ch is a control character (u_iscntrl). */
    static inline UBool isControl(UChar32 ch);

    /** Checks if ch is printable (u_isprint). */
    static inline UBool isPrintable(UChar32 ch);

    /** Checks if ch is a base form character that can take a diacritic (u_isbase). */
    static inline UBool isBaseForm(UChar32 ch);

    /** Checks if ch is a letter (u_isalpha). */
    static inline UBool isLetter(UChar32 ch);

    /** Checks if ch can start a Java identifier (u_isJavaIDStart). */
    static inline UBool isJavaIdentifierStart(UChar32 ch);

    /** Checks if ch can be a non-initial part of a Java identifier (u_isJavaIDPart). */
    static inline UBool isJavaIdentifierPart(UChar32 ch);

    /** Checks if ch can start a Unicode identifier (u_isIDStart). */
    static inline UBool isUnicodeIdentifierStart(UChar32 ch);

    /** Checks if ch can be a non-initial part of a Unicode identifier (u_isIDPart). */
    static inline UBool isUnicodeIdentifierPart(UChar32 ch);

    /** Checks if ch can be ignored in a Java or Unicode identifier (u_isIDIgnorable). */
    static inline UBool isIdentifierIgnorable(UChar32 ch);

    /** Maps ch to its lower case equivalent (u_tolower). */
    static inline UChar32 toLowerCase(UChar32 ch);

    /** Maps ch to its upper case equivalent (u_toupper). */
    static inline UChar32 toUpperCase(UChar32 ch);

    /** Maps ch to its title case equivalent (u_totitle). */
    static inline UChar32 toTitleCase(UChar32 ch);

    /** Checks if ch is a space character (u_isspace). */
    static inline UBool isSpaceChar(UChar32 ch);

    /** Checks if ch is white space according to ICU (u_isWhitespace). */
    static inline UBool isWhitespace(UChar32 ch);

    /** Gets the character type of ch as reported by u_charType. */
    static inline int8_t getType(UChar32 ch);

    /** Gets the linguistic directionality of ch (u_charDirection). */
    static inline EDirectionProperty characterDirection(UChar32 ch);

    /** Checks if c has the "mirrored" property (u_isMirrored). */
    static inline UBool isMirrored(UChar32 c);

    /** Maps c to its "mirror-image" character, or to itself (u_charMirror). */
    static inline UChar32 charMirror(UChar32 c);

    /** Gets the script associated with ch (u_charScript). */
    static inline EUnicodeScript getScript(UChar32 ch);

    /** Gets the table cell width of ch (u_charCellWidth). */
    static inline uint16_t getCellWidth(UChar32 ch);

    /**
     * Calls u_charName(code, nameChoice, buffer, bufferLength, ...) and
     * returns the length it reports, or 0 if it reports an error.
     */
    static inline UTextOffset
    getCharName(uint32_t code,
                char *buffer, UTextOffset bufferLength,
                UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);

    /** Gets the digit value of ch as reported by u_charDigitValue. */
    static inline int32_t digitValue(UChar32 ch);

    /**
     * Gets the value of ch as a digit in the given radix: decimal digits as
     * reported by u_charDigitValue, then A-Z / a-z as 10..35.
     * Returns -1 if radix is outside MIN_RADIX..MAX_RADIX or ch is not a
     * digit below radix.
     */
    static inline int8_t digit(UChar32 ch, int8_t radix);

    /**
     * Gets the character for a digit value in the given radix: '0'-'9' for
     * values below 10, 'a'-'z' above.  Returns 0 if radix is outside
     * MIN_RADIX..MAX_RADIX or digit is not in 0..radix-1.
     */
    static inline UChar32 forDigit(int32_t digit, int8_t radix);

    /**
     * Passes info to u_getUnicodeVersion.
     * Declared inline to agree with its inline definition later in this
     * header and with every other member of this class.
     */
    static inline void getUnicodeVersion(UVersionInfo info);

protected:
    // These constructors, destructor, and assignment operator must
    // be protected (not private, as they semantically are) to make
    // various UNIX compilers happy. [LIU]
    // They should be private to prevent anyone from instantiating or
    // subclassing Unicode.
    Unicode();
    Unicode(const Unicode &other);
    ~Unicode();
    const Unicode &operator=(const Unicode &other);
};

/* inline implementations --------------------------------------------------- */

/**
 * Thin wrapper around the UTF_IS_SINGLE macro.
 *
 * @param c  the code unit handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isSingle(UChar c)
{
    return UTF_IS_SINGLE(c);
}

/**
 * Thin wrapper around the UTF_IS_LEAD macro.
 *
 * @param c  the code unit handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isLead(UChar c)
{
    return UTF_IS_LEAD(c);
}

/**
 * Thin wrapper around the UTF_IS_TRAIL macro.
 *
 * @param c  the code unit handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isTrail(UChar c)
{
    return UTF_IS_TRAIL(c);
}

/**
 * Thin wrapper around the UTF_IS_SURROGATE macro.
 *
 * @param c  the value handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isSurrogate(UChar32 c)
{
    return UTF_IS_SURROGATE(c);
}

/**
 * Thin wrapper around the UTF_IS_UNICODE_CHAR macro.
 *
 * @param c  the value handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isUnicodeChar(UChar32 c)
{
    return UTF_IS_UNICODE_CHAR(c);
}

/**
 * Thin wrapper around the UTF_IS_ERROR macro.
 *
 * @param c  the value handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isError(UChar32 c)
{
    return UTF_IS_ERROR(c);
}

/**
 * Thin wrapper around the UTF_IS_VALID macro.
 *
 * @param c  the value handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::isValid(UChar32 c)
{
    return UTF_IS_VALID(c);
}

/**
 * Thin wrapper around the UTF_NEED_MULTIPLE_UCHAR macro.
 *
 * @param c  the value handed to the macro
 * @return   the macro's result, converted to UBool
 */
inline UBool Unicode::needMultipleUChar(UChar32 c)
{
    return UTF_NEED_MULTIPLE_UCHAR(c);
}

/**
 * Thin wrapper around the UTF_CHAR_LENGTH macro.
 *
 * @param c  the value handed to the macro
 * @return   the macro's result as an int32_t
 */
inline int32_t Unicode::charLength(UChar32 c)
{
    return UTF_CHAR_LENGTH(c);
}

/**
 * Thin wrapper around the UTF_ARRAY_SIZE macro.
 *
 * @param size  the size handed to the macro
 * @return      the macro's result as an int32_t
 */
inline int32_t Unicode::arraySize(int32_t size)
{
    return UTF_ARRAY_SIZE(size);
}

/**
 * Tells whether ch is a lower case letter.
 *
 * @param ch  the character to test
 * @return    the result of u_islower(ch)
 */
inline UBool Unicode::isLowerCase(UChar32 ch)
{
    return u_islower(ch);
}

/**
 * Tells whether ch is an upper case letter.
 *
 * @param ch  the character to test
 * @return    the result of u_isupper(ch)
 */
inline UBool Unicode::isUpperCase(UChar32 ch)
{
    return u_isupper(ch);
}

/**
 * Tells whether ch is a title case letter (usually these are upper case
 * letters).
 *
 * @param ch  the character to test
 * @return    the result of u_istitle(ch)
 */
inline UBool Unicode::isTitleCase(UChar32 ch)
{
    return u_istitle(ch);
}

/**
 * Tells whether ch is a decimal digit.
 *
 * @param ch  the character to test
 * @return    the result of u_isdigit(ch)
 */
inline UBool Unicode::isDigit(UChar32 ch)
{
    return u_isdigit(ch);
}

/**
 * Tells whether ch is a Unicode character that has an assigned character
 * type.
 *
 * @param ch  the character to test
 * @return    the result of u_isdefined(ch)
 */
inline UBool Unicode::isDefined(UChar32 ch)
{
    return u_isdefined(ch);
}

/**
 * Tells whether ch is a control character.
 *
 * @param ch  the character to test
 * @return    the result of u_iscntrl(ch)
 */
inline UBool Unicode::isControl(UChar32 ch)
{
    return u_iscntrl(ch);
}

/**
 * Tells whether ch is a printable character.
 *
 * @param ch  the character to test
 * @return    the result of u_isprint(ch)
 */
inline UBool Unicode::isPrintable(UChar32 ch)
{
    return u_isprint(ch);
}

/**
 * Tells whether ch is a base form character, i.e. one that can take a
 * diacritic.
 *
 * @param ch  the character to test
 * @return    the result of u_isbase(ch)
 */
inline UBool Unicode::isBaseForm(UChar32 ch)
{
    return u_isbase(ch);
}

/**
 * Tells whether ch is a letter.
 *
 * @param ch  the character to test
 * @return    the result of u_isalpha(ch)
 */
inline UBool Unicode::isLetter(UChar32 ch)
{
    return u_isalpha(ch);
}

/**
 * Tells whether ch may be the first character of a Java identifier.
 *
 * @param ch  the character to test
 * @return    the result of u_isJavaIDStart(ch)
 */
inline UBool Unicode::isJavaIdentifierStart(UChar32 ch)
{
    return u_isJavaIDStart(ch);
}

/**
 * Tells whether ch may appear in a Java identifier at a position other than
 * the first.
 *
 * @param ch  the character to test
 * @return    the result of u_isJavaIDPart(ch)
 */
inline UBool Unicode::isJavaIdentifierPart(UChar32 ch)
{
    return u_isJavaIDPart(ch);
}

/**
 * Tells whether ch may be the first character of a Unicode identifier.
 *
 * @param ch  the character to test
 * @return    the result of u_isIDStart(ch)
 */
inline UBool Unicode::isUnicodeIdentifierStart(UChar32 ch)
{
    return u_isIDStart(ch);
}

/**
 * Tells whether ch may appear in a Unicode identifier at a position other
 * than the first.
 *
 * @param ch  the character to test
 * @return    the result of u_isIDPart(ch)
 */
inline UBool Unicode::isUnicodeIdentifierPart(UChar32 ch)
{
    return u_isIDPart(ch);
}

/**
 * Tells whether ch is ignorable inside a Java or Unicode identifier.
 *
 * @param ch  the character to test
 * @return    the result of u_isIDIgnorable(ch)
 */
inline UBool Unicode::isIdentifierIgnorable(UChar32 ch)
{
    return u_isIDIgnorable(ch);
}

/**
 * Converts ch to its lower case equivalent.
 *
 * @param ch  the character to convert
 * @return    the result of u_tolower(ch)
 */
inline UChar32 Unicode::toLowerCase(UChar32 ch)
{
    return u_tolower(ch);
}
    
/**
 * Converts ch to its upper case equivalent.
 *
 * @param ch  the character to convert
 * @return    the result of u_toupper(ch)
 */
inline UChar32 Unicode::toUpperCase(UChar32 ch)
{
    return u_toupper(ch);
}

/**
 * Converts ch to its title case equivalent.
 *
 * @param ch  the character to convert
 * @return    the result of u_totitle(ch)
 */
inline UChar32 Unicode::toTitleCase(UChar32 ch)
{
    return u_totitle(ch);
}

/**
 * Tells whether ch is a space character.
 *
 * @param ch  the character to test
 * @return    the result of u_isspace(ch)
 */
inline UBool Unicode::isSpaceChar(UChar32 ch)
{
    return u_isspace(ch);
}

/**
 * Tells whether ch counts as white space according to ICU.
 *
 * @param ch  the character to test
 * @return    the result of u_isWhitespace(ch)
 */
inline UBool Unicode::isWhitespace(UChar32 ch)
{
    return u_isWhitespace(ch);
}

/**
 * Gets the character property (type) of ch.
 *
 * @param ch  the character to look up
 * @return    the result of u_charType(ch)
 */
inline int8_t Unicode::getType(UChar32 ch)
{
    return u_charType(ch);
}

/**
 * Gets the linguistic directionality of ch.
 *
 * @param ch  the character to look up
 * @return    the result of u_charDirection(ch), converted to
 *            EDirectionProperty
 */
inline Unicode::EDirectionProperty Unicode::characterDirection(UChar32 ch)
{
    return static_cast<EDirectionProperty>(u_charDirection(ch));
}

/**
 * Tells whether ch has the "mirrored" property.
 *
 * @param ch  the character to test
 * @return    the result of u_isMirrored(ch)
 */
inline UBool Unicode::isMirrored(UChar32 ch)
{
    return u_isMirrored(ch);
}

/**
 * Maps ch to its "mirror-image" character, or to itself.
 *
 * @param ch  the character to map
 * @return    the result of u_charMirror(ch)
 */
inline UChar32 Unicode::charMirror(UChar32 ch)
{
    return u_charMirror(ch);
}

/**
 * Gets the script associated with ch.
 *
 * @param ch  the character to look up
 * @return    the result of u_charScript(ch), converted to EUnicodeScript
 */
inline Unicode::EUnicodeScript Unicode::getScript(UChar32 ch)
{
    return static_cast<EUnicodeScript>(u_charScript(ch));
}

/**
 * Gets the table cell width of ch.
 *
 * @param ch  the character to look up
 * @return    the result of u_charCellWidth(ch)
 */
inline uint16_t Unicode::getCellWidth(UChar32 ch)
{
    return u_charCellWidth(ch);
}

/**
 * Looks up a character name through u_charName.
 *
 * The error code used for the call is local to this function, so callers
 * only see the outcome through the return value.
 *
 * @param code          the code passed to u_charName
 * @param buffer        the destination buffer passed to u_charName
 * @param bufferLength  the buffer length passed to u_charName
 * @param nameChoice    which name to retrieve, passed to u_charName
 * @return              the length reported by u_charName, or 0 if the call
 *                      did not succeed
 */
inline UTextOffset Unicode::getCharName(uint32_t code,
                                        char *buffer, UTextOffset bufferLength,
                                        UCharNameChoice nameChoice)
{
    UErrorCode status=U_ZERO_ERROR;
    const UTextOffset nameLength=u_charName(code, nameChoice, buffer, bufferLength, &status);
    if(U_SUCCESS(status)) {
        return nameLength;
    }
    return 0;
}

/**
 * Gets the digit value of ch.
 *
 * @param ch  the character to look up
 * @return    the result of u_charDigitValue(ch)
 */
inline int32_t Unicode::digitValue(UChar32 ch)
{
    return u_charDigitValue(ch);
}

/**
 * Gets the numeric value of ch when read as a digit in the given radix.
 *
 * Decimal digits are resolved through u_charDigitValue(); if that reports a
 * negative value, the letters A-Z and a-z (0x41-0x5a, 0x61-0x7a) are
 * accepted with the values 10..35.
 *
 * @param ch     the character to convert
 * @param radix  the radix; must lie in MIN_RADIX..MAX_RADIX
 * @return       the digit value, or -1 if radix is out of range, ch is not a
 *               digit character, or its value is not below radix
 */
inline int8_t
Unicode::digit(UChar32 ch, int8_t radix) {
    // ### TODO this should probably move to a C u_charDigitValueEx(ch, radix) and be called here
    if(radix<MIN_RADIX || radix>MAX_RADIX) {
        return -1; // invalid radix
    }

    int8_t value=(int8_t)u_charDigitValue(ch);
    if(value<0) {
        // ch is not a decimal digit, try latin letters.
        // Plain range comparisons avoid computing ch-0x41 for arbitrary ch.
        if(ch>=0x41 && ch<=0x5a) {
            value=(int8_t)(ch-(0x41-10)); // A-Z, subtract A
        } else if(ch>=0x61 && ch<=0x7a) {
            value=(int8_t)(ch-(0x61-10)); // a-z, subtract a
        } else {
            return -1; // ch is not a digit character
        }
    }

    // Return the failure value as a real -1 rather than routing 255 through
    // uint8_t and relying on how the compiler narrows it back to int8_t.
    return (int8_t)((value<radix) ? value : -1);
}

/**
 * Gets the character that represents a digit value in the given radix.
 *
 * Values below 10 map to 0x30 + digit ('0'-'9'); larger values map to
 * 0x61 + (digit - 10) ('a' onward).
 *
 * @param digit  the digit value; must lie in 0..radix-1
 * @param radix  the radix; must lie in MIN_RADIX..MAX_RADIX
 * @return       the digit character, or 0 if either argument is out of range
 */
inline UChar32 Unicode::forDigit(int32_t digit, int8_t radix)
{
    // ### TODO this should probably move to a C u_forDigit(digit, radix) and be called here
    if(radix<MIN_RADIX || radix>MAX_RADIX) {
        return 0;
    }
    if(digit<0 || digit>=radix) {
        return 0;
    }

    if(digit<10) {
        return (UChar32)(0x30+digit);
    }
    return (UChar32)((0x61-10)+digit);
}

/**
 * Hands versionArray to u_getUnicodeVersion().
 *
 * @param versionArray  the UVersionInfo passed through unchanged
 */
inline void Unicode::getUnicodeVersion(UVersionInfo versionArray)
{
    u_getUnicodeVersion(versionArray);
}

#endif

Generated at Wed Aug 16 16:05:53 2000 for ICU1.6 by doxygen 1.0.0 written by Dimitri van Heesch, © 1997-1999