Administrator
Published on 2024-05-13 / 7 Visits
0
0

C语言把UCS-2十六进制字符串转为UTF-8

代码:
usc2.c

#include <stdio.h>  
#include <stdlib.h>  
#include <string.h>  
#include <stdint.h>  
  
/*
 * Encode a single UCS-2 code unit as a NUL-terminated UTF-8 sequence.
 *
 * ucs2_char: the 16-bit code unit to encode.
 * utf8_str:  output buffer; must have room for at least 4 bytes
 *            (up to 3 encoded bytes plus the terminating NUL).
 *
 * Surrogate code units (0xD800-0xDFFF) are not characters on their own and
 * must not be encoded in UTF-8, so they are written as the error indicator
 * "?". Note that code unit 0x0000 yields an empty string, because the
 * encoded byte is itself the terminator.
 */
void ucs2_to_utf8(uint16_t ucs2_char, char *utf8_str) {
    if (ucs2_char < 0x80) {
        // 1-byte sequence: 0xxxxxxx
        utf8_str[0] = (char)ucs2_char;
        utf8_str[1] = '\0';
    } else if (ucs2_char < 0x800) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        utf8_str[0] = (char)(0xC0 | ((ucs2_char >> 6) & 0x1F));
        utf8_str[1] = (char)(0x80 | (ucs2_char & 0x3F));
        utf8_str[2] = '\0';
    } else if (ucs2_char >= 0xD800 && ucs2_char <= 0xDFFF) {
        // Lone surrogate: encoding it would produce ill-formed UTF-8.
        utf8_str[0] = '?';
        utf8_str[1] = '\0';
    } else {
        // Everything else a uint16_t can hold (up to 0xFFFF) fits in a
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        utf8_str[0] = (char)(0xE0 | ((ucs2_char >> 12) & 0x0F));
        utf8_str[1] = (char)(0x80 | ((ucs2_char >> 6) & 0x3F));
        utf8_str[2] = (char)(0x80 | (ucs2_char & 0x3F));
        utf8_str[3] = '\0';
    }
}
  
/* Return the value (0-15) of a hexadecimal digit, or -1 if c is not one. */
static int hex_digit_value(char c) {
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    return -1;
}

/*
 * Convert a string of UCS-2 code units, written as groups of four hex
 * digits, to UTF-8 and print the result followed by a newline.
 *
 * Usage: <program> <hex_string>
 *
 * Returns 0 on success. Returns non-zero when the argument count is wrong,
 * the string length is not a multiple of 4, the string contains a character
 * that is not a hex digit, or memory cannot be allocated.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <hex_string>\n", argv[0]);
        return 1;
    }

    const char *hex_str = argv[1];
    size_t hex_str_len = strlen(hex_str);

    // Each UCS-2 code unit is written as exactly 4 hex digits.
    if (hex_str_len % 4 != 0) {
        fprintf(stderr, "Error: Invalid hex string length (must be a multiple of 4).\n");
        return 1;
    }

    // Every code unit expands to at most 3 UTF-8 bytes, so a single
    // worst-case allocation (plus the terminating NUL) always suffices.
    // This also gives an empty input a valid, empty result string.
    size_t capacity = (hex_str_len / 4) * 3 + 1;
    char *result = malloc(capacity);
    if (!result) {
        perror("malloc failed");
        return EXIT_FAILURE;
    }
    size_t result_len = 0;

    for (size_t i = 0; i < hex_str_len; i += 4) {
        // Parse the 4 digits strictly; unlike sscanf("%hx") this rejects
        // signs, "0x" prefixes, whitespace and partially valid groups.
        uint16_t ucs2_char = 0;
        for (size_t j = 0; j < 4; j++) {
            int digit = hex_digit_value(hex_str[i + j]);
            if (digit < 0) {
                fprintf(stderr, "Error: Invalid hex digit at position %zu.\n", i + j);
                free(result);
                return 1;
            }
            ucs2_char = (uint16_t)((ucs2_char << 4) | digit);
        }

        // Room for a 3-byte UTF-8 sequence plus the terminating NUL.
        char utf8_str[4];
        ucs2_to_utf8(ucs2_char, utf8_str);

        // Append the encoded bytes; the terminator is written once at the end.
        size_t utf8_len = strlen(utf8_str);
        memcpy(result + result_len, utf8_str, utf8_len);
        result_len += utf8_len;
    }
    result[result_len] = '\0';

    printf("%s\n", result);

    free(result);

    return 0;
}

编译:

gcc -o usc2 usc2.c

使用:

chmod +x usc2
./usc2 0072003300306B228FCE4F60676552305E7F5DDE8F897FA4667A80FD79D1628067099650516C53F8

Comment