185 lines
7.3 KiB
C
185 lines
7.3 KiB
C
|
/* Copyright 2013 Google Inc. All Rights Reserved.
|
||
|
|
||
|
Distributed under MIT license.
|
||
|
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
||
|
*/
|
||
|
|
||
|
/* Functions to map previous bytes into a context id. */
|
||
|
|
||
|
#ifndef BROTLI_ENC_CONTEXT_H_
|
||
|
#define BROTLI_ENC_CONTEXT_H_
|
||
|
|
||
|
#include <brotli/port.h>
|
||
|
#include <brotli/types.h>
|
||
|
|
||
|
#if defined(__cplusplus) || defined(c_plusplus)
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
/* Second-order context lookup table for UTF8 byte streams.
|
||
|
|
||
|
If p1 and p2 are the previous two bytes, we calculate the context as
|
||
|
|
||
|
context = kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256].
|
||
|
|
||
|
If the previous two bytes are ASCII characters (i.e. < 128), this will be
|
||
|
equivalent to
|
||
|
|
||
|
context = 4 * context1(p1) + context2(p2),
|
||
|
|
||
|
where context1 is based on the previous byte in the following way:
|
||
|
|
||
|
0 : non-ASCII control
|
||
|
1 : \t, \n, \r
|
||
|
2 : space
|
||
|
3 : other punctuation
|
||
|
4 : " '
|
||
|
5 : %
|
||
|
6 : ( < [ {
|
||
|
7 : ) > ] }
|
||
|
8 : , ; :
|
||
|
9 : .
|
||
|
10 : =
|
||
|
11 : number
|
||
|
12 : upper-case vowel
|
||
|
13 : upper-case consonant
|
||
|
14 : lower-case vowel
|
||
|
15 : lower-case consonant
|
||
|
|
||
|
and context2 is based on the second last byte:
|
||
|
|
||
|
0 : control, space
|
||
|
1 : punctuation
|
||
|
2 : upper-case letter, number
|
||
|
3 : lower-case letter
|
||
|
|
||
|
If the last byte is ASCII, and the second last byte is not (in a valid UTF8
|
||
|
stream it will be a continuation byte, value between 128 and 191), the
|
||
|
context is the same as if the second last byte was an ASCII control or space.
|
||
|
|
||
|
If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
|
||
|
be a continuation byte and the context id is 2 or 3 depending on the LSB of
|
||
|
the last byte and to a lesser extent on the second last byte if it is ASCII.
|
||
|
|
||
|
If the last byte is a UTF8 continuation byte, the second last byte can be:
  - continuation byte: the next byte is probably ASCII or lead byte (assuming
    4-byte UTF8 characters are rare) and the context id is 0 or 1.
  - lead byte (192 - 223): next byte is ASCII or lead byte, context is 0 or 1
  - lead byte (224 - 255): next byte is continuation byte, context is 2 or 3
|
||
|
|
||
|
The possible value combinations of the previous two bytes, the range of
|
||
|
context ids and the type of the next byte is summarized in the table below:
|
||
|
|
||
|
|--------\-----------------------------------------------------------------|
|         \                         Last byte                              |
| Second   \---------------------------------------------------------------|
| last byte \    ASCII            |   cont. byte        |   lead byte      |
|            \   (0-127)          |   (128-191)         |   (192-)         |
|=============|===================|=====================|==================|
|  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
|  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
|-------------|-------------------|---------------------|------------------|
|  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
|  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
|-------------|-------------------|---------------------|------------------|
|  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
|  (192-223)  |                   |  context: 0 - 1     |                  |
|-------------|-------------------|---------------------|------------------|
|  lead byte  | not valid         |  next: cont.        |  not valid       |
|  (224-)     |                   |  context: 2 - 3     |                  |
|-------------|-------------------|---------------------|------------------|
|
||
|
*/
|
||
|
/* Two 256-entry tables stored back to back:
     [0, 256)   is indexed by the last byte and yields 4 * context1 for ASCII
                bytes, 0/1 for continuation bytes and 2/3 for lead bytes;
     [256, 512) is indexed by (second last byte + 256) and yields context2
                (0..3) for ASCII bytes, 0 for bytes 128-223 and 2 for bytes
                224-255.
   The two looked-up values are OR-ed together to form the context id. */
static const uint8_t kUTF8ContextLookup[512] = {
  /* Last byte. */
  /* */
  /* ASCII range (0x00 - 0x7F). */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,  /* 0x00 */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 0x10 */
   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,  /* 0x20 */
  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,  /* 0x30 */
  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,  /* 0x40 */
  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,  /* 0x50 */
  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,  /* 0x60 */
  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,  /* 0x70 */
  /* UTF8 continuation byte range (0x80 - 0xBF). */
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  /* 0x80 */
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  /* 0x90 */
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  /* 0xA0 */
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  /* 0xB0 */
  /* UTF8 lead byte range (0xC0 - 0xFF). */
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,  /* 0xC0 */
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,  /* 0xD0 */
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,  /* 0xE0 */
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,  /* 0xF0 */
  /* Second last byte. */
  /* */
  /* ASCII range (0x00 - 0x7F). */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x00 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x10 */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x20 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,  /* 0x30 */
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0x40 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,  /* 0x50 */
  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0x60 */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,  /* 0x70 */
  /* UTF8 continuation byte range (0x80 - 0xBF). */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB0 */
  /* UTF8 lead byte range (0xC0 - 0xFF): 0 for 0xC0-0xDF, 2 for 0xE0-0xFF. */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xC0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xD0 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xE0 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xF0 */
};
|
||
|
|
||
|
/* Context lookup table for small signed integers.

   Maps a byte to a 3-bit class (0..7):
     0         -> 0
     1 - 15    -> 1
     16 - 63   -> 2
     64 - 127  -> 3
     128 - 191 -> 4
     192 - 239 -> 5
     240 - 254 -> 6
     255       -> 7
   The size is spelled out so that every uint8_t index is guaranteed to be in
   bounds even if an initializer row is accidentally dropped. */
static const uint8_t kSigned3BitContextLookup[256] = {
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x00 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0x10 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0x20 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0x30 */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0x40 */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0x50 */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0x60 */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0x70 */
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  /* 0x80 */
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  /* 0x90 */
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  /* 0xA0 */
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  /* 0xB0 */
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  /* 0xC0 */
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  /* 0xD0 */
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  /* 0xE0 */
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,  /* 0xF0 */
};
|
||
|
|
||
|
/* Selects how Context() turns the two preceding bytes into a context id. */
typedef enum ContextType {
  /* Low six bits of the last byte. */
  CONTEXT_LSB6 = 0,
  /* High six bits of the last byte. */
  CONTEXT_MSB6 = 1,
  /* kUTF8ContextLookup entries for the last two bytes, OR-ed together. */
  CONTEXT_UTF8 = 2,
  /* kSigned3BitContextLookup class of the last byte (shifted left by 3)
     plus the class of the second last byte. */
  CONTEXT_SIGNED = 3
} ContextType;
|
||
|
|
||
|
/* Computes the context id for the next byte from the two bytes preceding it.

   p1 is the last byte and p2 is the second last byte; mode selects the
   mapping. Every mode yields a value in the range 0..63:
     CONTEXT_LSB6   - the low six bits of p1 (p2 is ignored);
     CONTEXT_MSB6   - the high six bits of p1 (p2 is ignored);
     CONTEXT_UTF8   - kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256];
     CONTEXT_SIGNED - 8 * class(p1) + class(p2), where class() is the
                      kSigned3BitContextLookup entry.
   Any other mode value returns 0. */
static BROTLI_INLINE uint8_t Context(uint8_t p1, uint8_t p2, ContextType mode) {
  switch (mode) {
    case CONTEXT_LSB6:
      /* The operands promote to int; cast back explicitly like the other
         branches do. */
      return (uint8_t)(p1 & 0x3f);
    case CONTEXT_MSB6:
      return (uint8_t)(p1 >> 2);
    case CONTEXT_UTF8:
      /* The second-last-byte table lives in the upper half of the array. */
      return (uint8_t)(kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256]);
    case CONTEXT_SIGNED:
      return (uint8_t)((kSigned3BitContextLookup[p1] << 3) +
                       kSigned3BitContextLookup[p2]);
    default:
      return 0;
  }
}
|
||
|
|
||
|
#if defined(__cplusplus) || defined(c_plusplus)
|
||
|
} /* extern "C" */
|
||
|
#endif
|
||
|
|
||
|
#endif /* BROTLI_ENC_CONTEXT_H_ */
|