Collation‎ > ‎

Collation Examples

Simple Collation Sample Customization

The following program demonstrates how to compare strings and create sort keys using a collator for a given locale.

In C:

       #include <stdio.h>
       #include <memory.h>
       #include <string.h>
       #include "unicode/ustring.h"
       #include "unicode/utypes.h"
       #include "unicode/uloc.h"
       #include "unicode/ucol.h"
       #define MAXBUFFERSIZE 100
       #define BIGBUFFERSIZE 5000
       UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
       {
           UChar         dispName    [MAXBUFFERSIZE];
           int32_t       bufferLen   = 0;
           UChar         source            [MAXBUFFERSIZE];
           UChar         target            [MAXBUFFERSIZE];
           UCollationResult result   = UCOL_EQUAL;
           uint8_t             sourceKeyArray    [MAXBUFFERSIZE];
           uint8_t             targetKeyArray    [MAXBUFFERSIZE];
           int32_t       sourceKeyOut      = 0,
                       targetKeyOut = 0;
           UCollator     *myCollator = 0;
           if (U_FAILURE(*status))
           {
               return FALSE;
           }
           u_uastrcpy(source, "This is a test.");
           u_uastrcpy(target, "THIS IS A TEST.");
           myCollator = ucol_open(locale, status);
           if (U_FAILURE(*status)){
               bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status);
               /*Report the error with display name... */
               fprintf(stderr,
               "Failed to create the collator for : \"%s\"\n", dispName);
               return FALSE;
           }
           result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
           /* result is 1, secondary differences only for ignorable space characters*/
           if (result != UCOL_LESS)
           {
               fprintf(stderr,
               "Comparing two strings with only secondary differences in C failed.\n");
               return FALSE;
           }
           /* To compare them with just primary differences */
           ucol_setStrength(myCollator, UCOL_PRIMARY);
           result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
           /* result is 0 */
           if (result != 0)
           {
               fprintf(stderr,
               "Comparing two strings with no differences in C failed.\n");
               return FALSE;
           }

           /* Now, do the same comparison with keys */
           sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE);
           targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE);
           result = 0;
           result = strcmp(sourceKeyArray, targetKeyArray);
           if (result != 0)
           {
               fprintf(stderr,
               "Comparing two strings with sort keys in C failed.\n");
               return FALSE;
           }
           ucol_close(myCollator);
           return TRUE;
       }

In C++:

       #include <stdio.h>
       #include "unicode/unistr.h"
       #include "unicode/utypes.h"
       #include "unicode/locid.h"
       #include "unicode/coll.h"
       #include "unicode/tblcoll.h"
       #include "unicode/coleitr.h"
       #include "unicode/sortkey.h"
       UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
       {
           UnicodeString dispName;
           UnicodeString source("This is a test.");
           UnicodeString target("THIS IS A TEST.");
           Collator::EComparisonResult result    = Collator::EQUAL;
           CollationKey sourceKey;
           CollationKey targetKey;
           Collator      *myCollator = 0;
           if (U_FAILURE(status))
           {
               return FALSE;
           }
           myCollator = Collator::createInstance(locale, status);
           if (U_FAILURE(status)){
               locale.getDisplayName(dispName);
               /*Report the error with display name... */
               fprintf(stderr,
               "%s: Failed to create the collator for : \"%s\"\n", dispName);
               return FALSE;
           }
           result = myCollator->compare(source, target);
           /* result is 1, secondary differences only for ignorable space characters*/
           if (result != UCOL_LESS)
           {
               fprintf(stderr,
               "Comparing two strings with only secondary differences in C failed.\n");
               return FALSE;
           }
           /* To compare them with just primary differences */
           myCollator->setStrength(Collator::PRIMARY);
           result = myCollator->compare(source, target);
           /* result is 0 */
           if (result != 0)
           {
               fprintf(stderr,
               "Comparing two strings with no differences in C failed.\n");
               return FALSE;
           }
           /* Now, do the same comparison with keys */
           myCollator->getCollationKey(source, sourceKey, status);
           myCollator->getCollationKey(target, targetKey, status);
           result = Collator::EQUAL;

           result = sourceKey.compareTo(targetKey);
           if (result != 0)
           {
               fprintf(stderr,
               "%s: Comparing two strings with sort keys in C failed.\n");
               return FALSE;
           }
           delete myCollator;
           return TRUE;
       }

Main Function

       extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);
       int main()
       {
          UErrorCode status = U_ZERO_ERROR;
          fprintf(stdout, "\n");
          if (collateWithLocaleInCPP(Locale("en", "US"), status) != TRUE)
          {
               fprintf(stderr,
               "Collate with locale in C++ failed.\n");
          } else
          {
              fprintf(stdout, "Collate with Locale C++ example worked!!\n");
          }
          status = U_ZERO_ERROR;
          fprintf(stdout, "\n");
          if (collateWithLocaleInC("en_US", &status) != TRUE)
          {
               fprintf(stderr,
               "%s: Collate with locale in C failed.\n");
          } else
          {
              fprintf(stdout, "Collate with Locale C example worked!!\n");
          }
          return 0;
       }

In Java:

       
            import com.ibm.icu.text.Collator;
            import com.ibm.icu.text.CollationElementIterator;
            import com.ibm.icu.text.CollationKey;
            import java.util.Locale;

            public class CollateExample
            {
           
                public static void main(String arg[])
                {
                    CollateExample example = new CollateExample();
                    try {
                        if (!example.collateWithLocale(Locale.US)) {
                            System.err.println("Collate with locale example failed.");
                        }
                        else {
                            System.out.println("Collate with Locale example worked!!");
                        }
                    } catch (Exception e) {
                        System.err.println("Collating with locale failed");
                        e.printStackTrace();
                    }
                }
       
                public boolean collateWithLocale(Locale locale) throws Exception
                {
                    String source = "This is a test.";
                    String target = "THIS IS A TEST.";
                    Collator myCollator = Collator.getInstance(locale);

                    int result = myCollator.compare(source, target);
                    // result is 1, secondary differences only for ignorable space characters
                    if (result >= 0) {
                        System.err.println(
                            "Comparing two strings with only secondary differences failed.");
                        return false;
                    }
                    // To compare them with just primary differences
                    myCollator.setStrength(Collator.PRIMARY);
                    result = myCollator.compare(source, target);
                    // result is 0
                    if (result != 0) {
                        System.err.println(
                                       "Comparing two strings with no differences failed.");
                        return false;
                    }
                    // Now, do the same comparison with keys
                    CollationKey sourceKey = myCollator.getCollationKey(source);
                    CollationKey targetKey = myCollator.getCollationKey(target);
                    result = sourceKey.compareTo(targetKey);
                    if (result != 0) {
                        System.err.println("Comparing two strings with sort keys failed.");
                        return false;
                    }
                    return true;
                }   
           }    
       

Language-sensitive searching

String searching is a well-researched area, and there are algorithms that can optimize the searching process. Perhaps the best is the Boyer-Moore method. For a full description of the concepts behind the sample programs, please see Laura Werner's text searching article (http://icu-project.org/docs/papers/efficient_text_searching_in_java.html ).

The source of the language-sensitive text searching based on ICU Collation Service can be found on the Internet at http://source.icu-project.org/repos/icu/icu/trunk/source/i18n/usearch.cpp .

Using large buffers to manage sort keys

A good solution for the problem of not knowing the sort key size in advance is to allocate a large buffer and store all the sort keys there, while keeping a list of indexes or pointers to that buffer.

The following sample code takes a collator, an array of UChar pointers (the source strings), and an array that receives the offset of each sort key. It allocates a buffer, fills it with the sort keys, reports the size of the largest sort key through maxSize, and returns the total number of bytes used in the buffer. Once you have done this for your strings, you only need to allocate fields of the maximum size and copy your sort keys from the buffer into those fields.

/**
 * Generates the sort keys for an array of strings into one heap buffer.
 *
 * @param coll        Collator used to build the sort keys.
 * @param source      Array of sourceSize NUL-terminated UChar strings.
 * @param keys        Output array of sourceSize entries; keys[i] receives the
 *                    byte offset of the i-th sort key inside *buffer.
 * @param sourceSize  Number of strings in source.
 * @param buffer      Output; receives a malloc'ed buffer holding all keys back
 *                    to back. The caller must free() it. Set to NULL when an
 *                    allocation fails.
 * @param maxSize     Output; receives the size in bytes of the largest key.
 * @param status      In/out ICU error code. Nothing is done if it is NULL or
 *                    already indicates a failure on entry.
 * @return Total number of bytes used in *buffer, or 0 on failure.
 */
uint32_t
fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys, uint32_t sourceSize,
                            uint8_t **buffer, uint32_t *maxSize, UErrorCode *status)
{
  uint32_t bufferSize = 16384;
  const uint32_t increment = 16384;
  uint32_t currentOffset = 0;
  uint32_t keySize = 0;
  uint32_t i = 0;

  if(status == NULL || U_FAILURE(*status)) {
    return 0;
  }
  if(buffer == NULL || maxSize == NULL ||
     (sourceSize > 0 && (source == NULL || keys == NULL))) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
  }

  *maxSize = 0;

  *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
  /* Test the allocation result (*buffer), not the out-parameter itself. */
  if(*buffer == NULL) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return 0;
  }

  for(i = 0; i < sourceSize; i++) {
    keys[i] = currentOffset;
    keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset,
                                        (int32_t)(bufferSize-currentOffset));
    if(keySize > bufferSize-currentOffset) {
      /* The key did not fit. Grow in whole increments until it does; a single
       * increment is not enough for a key larger than the increment. The size
       * is capped so it still fits the int32_t capacity argument. */
      uint32_t newSize = bufferSize;
      uint8_t *grown = NULL;

      while(newSize-currentOffset < keySize && newSize <= 0x7fffffffu - increment) {
        newSize += increment;
      }
      if(newSize-currentOffset >= keySize) {
        /* Keep the old pointer until realloc is known to have succeeded. */
        grown = (uint8_t *)realloc(*buffer, newSize);
      }
      if(grown == NULL) {
        free(*buffer);
        *buffer = NULL;
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
      }
      *buffer = grown;
      bufferSize = newSize;
      keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset,
                                          (int32_t)(bufferSize-currentOffset));
    }
    /* here you can hook code that does something interesting with the keySize -
     * remembers the maximum or similar...
     */
    if(keySize > *maxSize) {
      *maxSize = keySize;
    }
    currentOffset += keySize;
  }

  return currentOffset;
}

Comments