Simple Collation Sample Customization
The following programs demonstrate how to compare strings and create sort keys with a collator for a given locale. In C:
#include <stdio.h> #include <memory.h> #include <string.h> #include "unicode/ustring.h" #include "unicode/utypes.h" #include "unicode/uloc.h" #include "unicode/ucol.h" #define MAXBUFFERSIZE 100 #define BIGBUFFERSIZE 5000 UBool collateWithLocaleInC(const char* locale, UErrorCode *status) { UChar dispName [MAXBUFFERSIZE]; int32_t bufferLen = 0; UChar source [MAXBUFFERSIZE]; UChar target [MAXBUFFERSIZE]; UCollationResult result = UCOL_EQUAL; uint8_t sourceKeyArray [MAXBUFFERSIZE]; uint8_t targetKeyArray [MAXBUFFERSIZE]; int32_t sourceKeyOut = 0, targetKeyOut = 0; UCollator *myCollator = 0; if (U_FAILURE(*status)) { return FALSE; } u_uastrcpy(source, "This is a test."); u_uastrcpy(target, "THIS IS A TEST."); myCollator = ucol_open(locale, status); if (U_FAILURE(*status)){ bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status); /*Report the error with display name... */ fprintf(stderr, "Failed to create the collator for : \"%s\"\n", dispName); return FALSE; } result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target)); /* result is 1, secondary differences only for ignorable space characters*/ if (result != UCOL_LESS) { fprintf(stderr, "Comparing two strings with only secondary differences in C failed.\n"); return FALSE; } /* To compare them with just primary differences */ ucol_setStrength(myCollator, UCOL_PRIMARY); result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target)); /* result is 0 */ if (result != 0) { fprintf(stderr, "Comparing two strings with no differences in C failed.\n"); return FALSE; }
/* Now, do the same comparison with keys */ sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE); targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE); result = 0; result = strcmp(sourceKeyArray, targetKeyArray); if (result != 0) { fprintf(stderr, "Comparing two strings with sort keys in C failed.\n"); return FALSE; } ucol_close(myCollator); return TRUE; }
|
In C++:
#include <stdio.h> #include "unicode/unistr.h" #include "unicode/utypes.h" #include "unicode/locid.h" #include "unicode/coll.h" #include "unicode/tblcoll.h" #include "unicode/coleitr.h" #include "unicode/sortkey.h" UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status) { UnicodeString dispName; UnicodeString source("This is a test."); UnicodeString target("THIS IS A TEST."); Collator::EComparisonResult result = Collator::EQUAL; CollationKey sourceKey; CollationKey targetKey; Collator *myCollator = 0; if (U_FAILURE(status)) { return FALSE; } myCollator = Collator::createInstance(locale, status); if (U_FAILURE(status)){ locale.getDisplayName(dispName); /*Report the error with display name... */ fprintf(stderr, "%s: Failed to create the collator for : \"%s\"\n", dispName); return FALSE; } result = myCollator->compare(source, target); /* result is 1, secondary differences only for ignorable space characters*/ if (result != UCOL_LESS) { fprintf(stderr, "Comparing two strings with only secondary differences in C failed.\n"); return FALSE; } /* To compare them with just primary differences */ myCollator->setStrength(Collator::PRIMARY); result = myCollator->compare(source, target); /* result is 0 */ if (result != 0) { fprintf(stderr, "Comparing two strings with no differences in C failed.\n"); return FALSE; } /* Now, do the same comparison with keys */ myCollator->getCollationKey(source, sourceKey, status); myCollator->getCollationKey(target, targetKey, status); result = Collator::EQUAL;
result = sourceKey.compareTo(targetKey); if (result != 0) { fprintf(stderr, "%s: Comparing two strings with sort keys in C failed.\n"); return FALSE; } delete myCollator; return TRUE; }
|
Main Function
extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);

/**
 * Runs the C++ sample and then the C sample for the en_US locale, printing
 * one result line for each.
 *
 * @return 0 when both samples succeed, 1 if either one fails.
 */
int main()
{
    int exitCode = 0;
    UErrorCode status = U_ZERO_ERROR;

    fprintf(stdout, "\n");
    if (collateWithLocaleInCPP(Locale("en", "US"), status) != TRUE) {
        fprintf(stderr, "Collate with locale in C++ failed.\n");
        exitCode = 1;
    } else {
        fprintf(stdout, "Collate with Locale C++ example worked!!\n");
    }

    /* Start the C sample from a clean status so that a failure in the C++
     * sample does not make it return early. */
    status = U_ZERO_ERROR;
    fprintf(stdout, "\n");
    if (collateWithLocaleInC("en_US", &status) != TRUE) {
        /* The original format string had a "%s" with no matching argument. */
        fprintf(stderr, "Collate with locale in C failed.\n");
        exitCode = 1;
    } else {
        fprintf(stdout, "Collate with Locale C example worked!!\n");
    }

    return exitCode;
}
|
In Java:
import com.ibm.icu.text.Collator; import com.ibm.icu.text.CollationElementIterator; import com.ibm.icu.text.CollationKey; import java.util.Locale;
public class CollateExample { public static void main(String arg[]) { CollateExample example = new CollateExample(); try { if (!example.collateWithLocale(Locale.US)) { System.err.println("Collate with locale example failed."); } else { System.out.println("Collate with Locale example worked!!"); } } catch (Exception e) { System.err.println("Collating with locale failed"); e.printStackTrace(); } } public boolean collateWithLocale(Locale locale) throws Exception { String source = "This is a test."; String target = "THIS IS A TEST."; Collator myCollator = Collator.getInstance(locale);
int result = myCollator.compare(source, target); // result is 1, secondary differences only for ignorable space characters if (result >= 0) { System.err.println( "Comparing two strings with only secondary differences failed."); return false; } // To compare them with just primary differences myCollator.setStrength(Collator.PRIMARY); result = myCollator.compare(source, target); // result is 0 if (result != 0) { System.err.println( "Comparing two strings with no differences failed."); return false; } // Now, do the same comparison with keys CollationKey sourceKey = myCollator.getCollationKey(source); CollationKey targetKey = myCollator.getCollationKey(target); result = sourceKey.compareTo(targetKey); if (result != 0) { System.err.println("Comparing two strings with sort keys failed."); return false; } return true; } }
|
Language-sensitive searching
String searching is a well-researched area, and there are algorithms that can
optimize the searching process. Perhaps the best is the Boyer-Moore method. For
a full description of the concepts behind the sample programs, please see
Laura Werner's text searching article (http://icu-project.org/docs/papers/efficient_text_searching_in_java.html).
The source of the language-sensitive text searching based on the ICU Collation Service can be found on the Internet at http://source.icu-project.org/repos/icu/icu/trunk/source/i18n/usearch.cpp
.
Using large buffers to manage sort keys
A good solution for the problem of not knowing the sort key size in
advance is to allocate a large buffer and store all the sort keys
there, while keeping a list of indexes or pointers into that buffer. The
following sample code takes an array of UChar pointers and an array of
key indexes. It allocates a buffer, fills it with the sort keys, stores the
maximum size of a sort key in maxSize, and returns the total number of bytes
used. Once you have done this for your strings, you just need to allocate
fields of the maximum size and copy your sort keys from the buffer to the
fields.
/*
 * Generates the sort key of every string in `source` and stores the keys
 * back to back in one heap buffer.
 *
 * coll        Collator used to generate the sort keys.
 * source      Array of `sourceSize` zero-terminated UChar strings.
 * keys        Output array of `sourceSize` entries; keys[i] receives the
 *             byte offset of string i's sort key inside *buffer.
 * sourceSize  Number of entries in `source` and `keys`.
 * buffer      Output: receives the malloc'ed buffer holding all keys. The
 *             caller releases it with free(). Set to NULL when an allocation
 *             fails.
 * maxSize     Output: size in bytes of the largest sort key generated.
 * status      In/out ICU error code. Nothing is done if it is NULL or
 *             already holds a failure. Set to U_ILLEGAL_ARGUMENT_ERROR for
 *             NULL arguments and to U_MEMORY_ALLOCATION_ERROR when the
 *             buffer cannot be allocated or grown.
 *
 * Returns the total number of bytes used in *buffer, or 0 on error.
 */
uint32_t
fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys, uint32_t sourceSize,
                   uint8_t **buffer, uint32_t *maxSize, UErrorCode *status)
{
    uint32_t bufferSize = 16384;
    const uint32_t increment = 16384;
    uint32_t currentOffset = 0;
    uint32_t keySize = 0;
    uint32_t i = 0;

    if(status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(coll == NULL || buffer == NULL || maxSize == NULL ||
       (sourceSize > 0 && (source == NULL || keys == NULL))) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    *maxSize = 0;

    *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
    if(*buffer == NULL) { /* test the allocation itself, not the out-parameter */
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    for(i = 0; i < sourceSize; i++) {
        keys[i] = currentOffset;
        keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1,
                                            *buffer+currentOffset,
                                            (int32_t)(bufferSize-currentOffset));
        if(keySize > bufferSize-currentOffset) {
            uint8_t *grown = NULL;

            /* ucol_getSortKey reported the size it needs: grow in whole
             * increments until the complete key fits. A single increment is
             * not enough for a key longer than `increment` bytes. */
            while(keySize > bufferSize-currentOffset) {
                bufferSize += increment;
            }

            /* Keep the old pointer until realloc succeeds so that the
             * buffer is not leaked when it fails. */
            grown = (uint8_t *)realloc(*buffer, bufferSize);
            if(grown == NULL) {
                free(*buffer);
                *buffer = NULL;
                *status = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
            *buffer = grown;

            keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1,
                                                *buffer+currentOffset,
                                                (int32_t)(bufferSize-currentOffset));
        }
        /* here you can hook code that does something interesting with the keySize -
         * remembers the maximum or similar...
         */
        if(keySize > *maxSize) {
            *maxSize = keySize;
        }
        currentOffset += keySize;
    }

    return currentOffset;
}
|
|