// #include "../builtins/libunicode.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Debug helpers: print the source text of an expression next to its value.
 * They expand to a single expression, so call sites supply their own ';'.
 */
#define ps(x) printf("%s = %s\n", #x, (const char *)(x))
#define pi(x) printf("%s = %d\n", #x, (x))
#define pp(x) printf("%s = %p\n", #x, (void *)(x))

/* Index of the most significant bit of an 8-bit byte. */
enum { BIT_TOP = 7 };

/*
 * Return bit `position` of `byte` as 0 or 1.
 * Position 0 is the least significant bit; callers must keep `position`
 * within 0-7.
 */
int getBit(unsigned char byte, int position)
{
    return (byte >> position) & 0x1;
}

/* Read cursor over a byte buffer supplied by the caller. */
struct bstate {
    unsigned char *data; /* buffer being read; never freed by bclose() */
    size_t index;        /* byte currently being read */
    size_t bitcount;     /* bits already consumed from data[index] */
};
typedef struct bstate bstate;

/*
 * Allocate a cursor positioned at the first bit of `data`.
 * Returns NULL if the allocation fails. Release with bclose().
 */
bstate *bopen(unsigned char *data)
{
    bstate *n = malloc(sizeof *n);
    if (n == NULL) {
        return NULL;
    }
    n->data = data;
    n->index = 0;
    n->bitcount = 0;
    return n;
}

/*
 * Release a cursor obtained from bopen(). The underlying data buffer is
 * left untouched. Passing NULL is a no-op.
 */
void bclose(bstate *n)
{
    if (n == NULL) {
        return;
    }
    n->data = NULL;
    n->index = 0;
    n->bitcount = 0;
    free(n);
}

/*
 * Read `len` bits from the cursor, most significant bit of each byte first.
 *
 * `ii` holds the bit position (7..0) of the next bit inside the current
 * byte. Start it at 7 and pass the same variable to every call made on the
 * same cursor; it is updated as bits are consumed.
 *
 * Returns a malloc'd, NUL-terminated string of '0'/'1' characters that the
 * caller must free(), or NULL on invalid arguments or allocation failure.
 * The cursor does not know how long its buffer is: the caller must make
 * sure at least `len` more bits are available.
 */
unsigned char *bread(bstate *n, int len, int *ii)
{
    if (n == NULL || n->data == NULL || ii == NULL || len < 0) {
        return NULL;
    }
    if (*ii < 0 || *ii > BIT_TOP) {
        return NULL; /* a position outside 0-7 would make the shift undefined */
    }

    /* One extra byte for the terminator. */
    unsigned char *bits = malloc((size_t)len + 1);
    if (bits == NULL) {
        return NULL;
    }

    for (int i = 0; i < len; i++) {
        bits[i] = getBit(n->data[n->index], *ii) ? '1' : '0';
        if (*ii == 0) {
            /* Lowest bit consumed: move on to the top bit of the next byte. */
            n->index++;
            *ii = BIT_TOP;
        } else {
            (*ii)--;
        }
        n->bitcount = (size_t)(BIT_TOP - *ii);
    }
    bits[len] = 0;
    return bits;
}

/* Floor of log2(x) for x >= 1 (dts(8) == 3); returns 0 for x == 0. */
int dts(int x)
{
    int n = 0;
    while (x /= 2) {
        n++;
    }
    return n;
}

// should this use a bstate?
/*
 * Pack a NUL-terminated string of '0'/'1' characters (the format produced
 * by bread()) back into bytes, first character into the most significant
 * bit. Any character other than '1' counts as a zero bit.
 *
 * If the number of characters is not a multiple of CHAR_BIT a warning is
 * printed and the trailing partial byte is dropped.
 *
 * Returns a malloc'd buffer of strlen(encoding) / CHAR_BIT bytes followed
 * by a NUL byte, which the caller must free(); NULL if `encoding` is NULL
 * or the allocation fails.
 */
unsigned char *bwrite(unsigned char *encoding)
{
    if (encoding == NULL) {
        return NULL;
    }

    // assume host byte size is CHAR_BIT
    const size_t nbits = strlen((const char *)encoding);
    if (nbits % CHAR_BIT != 0) {
        printf("warning: bit length is not evenly divisible against a byte, the last byte may be corrupted\n");
    }

    const size_t len = nbits / CHAR_BIT;
    /* calloc zero-fills, which also provides the trailing NUL. */
    unsigned char *s = calloc(len + 1, 1);
    if (s == NULL) {
        return NULL;
    }

    /* Only whole bytes are packed, so every write stays inside s[0..len-1]. */
    for (size_t i = 0; i < len * CHAR_BIT; i++) {
        if (encoding[i] == '1') {
            s[i / CHAR_BIT] |= (unsigned char)(1u << ((CHAR_BIT - 1) - (i % CHAR_BIT)));
        }
    }
    return s;
}

/* Result of the most recent detect() call. */
struct unicode {
    int bytes;          /* length of the sequence in bytes; 0 if the lead byte is invalid */
    int codepoint_bits; /* payload bits carried by a sequence of that length */
} unicode;

/*
 * Classify the first UTF-8 sequence of `string` from its lead byte, store
 * the result in the global `unicode`, and print the bits of each byte of
 * that sequence.
 *
 * Lead byte patterns: 0xxxxxxx -> 1 byte / 7 bits, 110xxxxx -> 2 / 11,
 * 1110xxxx -> 3 / 16, 11110xxx -> 4 / 21. Any other lead byte leaves both
 * fields at 0.
 */
void detect(char *string)
{
    ps(string);

    /* Reset so an invalid lead byte does not report the previous result. */
    unicode.bytes = 0;
    unicode.codepoint_bits = 0;

    bstate *c = bopen((unsigned char *)string);
    if (c == NULL) {
        return;
    }

    int i = BIT_TOP;
    unsigned char *bits = bread(c, 8, &i);
    if (bits == NULL) {
        bclose(c);
        return;
    }

    /* The number of leading 1 bits in the lead byte selects the length. */
    int ones = 0;
    while (ones < 8 && bits[ones] == '1') {
        ones++;
    }
    switch (ones) {
    case 0:
        unicode.bytes = 1;
        unicode.codepoint_bits = 7;
        break;
    case 2:
        unicode.bytes = 2;
        unicode.codepoint_bits = 11;
        break;
    case 3:
        unicode.bytes = 3;
        unicode.codepoint_bits = 16;
        break;
    case 4:
        unicode.bytes = 4;
        unicode.codepoint_bits = 21;
        break;
    default:
        break;
    }

    pi(unicode.bytes);
    pi(unicode.codepoint_bits);
    ps(bits);
    free(bits);
    bits = NULL;

    /*
     * Dump the continuation bytes. Stop at the terminator so a truncated
     * sequence never makes us read beyond the end of the string.
     */
    for (int k = 1; k < unicode.bytes; k++) {
        if (c->data[c->index] == 0) {
            break;
        }
        bits = bread(c, 8, &i);
        if (bits == NULL) {
            break;
        }
        ps(bits);
        free(bits);
        bits = NULL;
    }

    bclose(c);
}

/* Define BITS_NO_DEMO_MAIN to compile the helpers without the demo entry point. */
#ifndef BITS_NO_DEMO_MAIN
int main(void)
{
    detect("a");
    detect("あ");
    detect("€");
    return 0;
}
#endif