factor out CP1252 map

This commit is contained in:
Loic Nageleisen 2015-06-30 11:46:57 +02:00
parent f9abd8a0bb
commit 7c49eceeba

View file

@ -32,6 +32,38 @@ func Assume(e Encoding) func(*Fixer) error {
} }
} }
// cp1252 maps each byte in the 0x80-0x9F range that CP1252 defines to the
// UTF-8 encoding of the character it stands for. Each value is spelled as the
// character's Unicode code point; converting the string literal to []byte
// yields its UTF-8 bytes.
//
// CP1252 does not define 0x81, 0x8D, 0x8F, 0x90 and 0x9D, so those bytes have
// no entry here. The remainder of the code page is ISO-8859-1.
var cp1252 = map[byte][]byte{
	0x80: []byte("\u20AC"), // EURO SIGN
	0x82: []byte("\u201A"), // SINGLE LOW-9 QUOTATION MARK
	0x83: []byte("\u0192"), // LATIN SMALL LETTER F WITH HOOK
	0x84: []byte("\u201E"), // DOUBLE LOW-9 QUOTATION MARK
	0x85: []byte("\u2026"), // HORIZONTAL ELLIPSIS
	0x86: []byte("\u2020"), // DAGGER
	0x87: []byte("\u2021"), // DOUBLE DAGGER
	0x88: []byte("\u02C6"), // MODIFIER LETTER CIRCUMFLEX ACCENT
	0x89: []byte("\u2030"), // PER MILLE SIGN
	0x8A: []byte("\u0160"), // LATIN CAPITAL LETTER S WITH CARON
	0x8B: []byte("\u2039"), // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
	0x8C: []byte("\u0152"), // LATIN CAPITAL LIGATURE OE
	0x8E: []byte("\u017D"), // LATIN CAPITAL LETTER Z WITH CARON
	0x91: []byte("\u2018"), // LEFT SINGLE QUOTATION MARK
	0x92: []byte("\u2019"), // RIGHT SINGLE QUOTATION MARK
	0x93: []byte("\u201C"), // LEFT DOUBLE QUOTATION MARK
	0x94: []byte("\u201D"), // RIGHT DOUBLE QUOTATION MARK
	0x95: []byte("\u2022"), // BULLET
	0x96: []byte("\u2013"), // EN DASH
	0x97: []byte("\u2014"), // EM DASH
	0x98: []byte("\u02DC"), // SMALL TILDE
	0x99: []byte("\u2122"), // TRADE MARK SIGN
	0x9A: []byte("\u0161"), // LATIN SMALL LETTER S WITH CARON
	0x9B: []byte("\u203A"), // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
	0x9C: []byte("\u0153"), // LATIN SMALL LIGATURE OE
	0x9E: []byte("\u017E"), // LATIN SMALL LETTER Z WITH CARON
	0x9F: []byte("\u0178"), // LATIN CAPITAL LETTER Y WITH DIAERESIS
}
func Fix(r io.Reader, w io.Writer, options ...func(*Fixer) error) { func Fix(r io.Reader, w io.Writer, options ...func(*Fixer) error) {
f := &Fixer{} f := &Fixer{}
@ -108,36 +140,6 @@ func Fix(r io.Reader, w io.Writer, options ...func(*Fixer) error) {
// CP1252 // CP1252
if handle_cp1252 { if handle_cp1252 {
// does not define 0x81, 0x8D, 0x8F, 0x90, 09D
cp1252 := map[byte][]byte{
0x80: {0xE2, 0x82, 0xAC}, // EURO SIGN
0x82: {0xE2, 0x80, 0x9A}, // SINGLE LOW-9 QUOTATION MARK
0x83: {0xC6, 0x92}, // LATIN SMALL LETTER F WITH HOOK
0x84: {0xE2, 0x80, 0x9E}, // DOUBLE LOW-9 QUOTATION MARK
0x85: {0xE2, 0x80, 0xA6}, // HORIZONTAL ELLIPSIS
0x86: {0xE2, 0x80, 0xA0}, // DAGGER
0x87: {0xE2, 0x80, 0xA1}, // DOUBLE DAGGER
0x88: {0xCB, 0x86}, // MODIFIER LETTER CIRCUMFLEX ACCENT
0x89: {0xE2, 0x80, 0xB0}, // PER MILLE SIGN
0x8A: {0xC5, 0xA0}, // LATIN CAPITAL LETTER S WITH CARON
0x8B: {0xE2, 0x80, 0xB9}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x8C: {0xC5, 0x92}, // LATIN CAPITAL LIGATURE OE
0x8E: {0xC5, 0xBD}, // LATIN CAPITAL LETTER Z WITH CARON
0x91: {0xE2, 0x80, 0x98}, // LEFT SINGLE QUOTATION MARK
0x92: {0xE2, 0x80, 0x99}, // RIGHT SINGLE QUOTATION MARK
0x93: {0xE2, 0x80, 0x9C}, // LEFT DOUBLE QUOTATION MARK
0x94: {0xE2, 0x80, 0x9D}, // RIGHT DOUBLE QUOTATION MARK
0x95: {0xE2, 0x80, 0xA2}, // BULLET
0x96: {0xE2, 0x80, 0x93}, // EN DASH
0x97: {0xE2, 0x80, 0x94}, // EM DASH
0x98: {0xCB, 0x9C}, // SMALL TILDE
0x99: {0xE2, 0x84, 0xA2}, // TRADE MARK SIGN
0x9A: {0xC5, 0xA1}, // LATIN SMALL LETTER S WITH CARON
0x9B: {0xE2, 0x80, 0xBA}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x9C: {0xC5, 0x93}, // LATIN SMALL LIGATURE OE
0x9E: {0xC5, 0xBE}, // LATIN SMALL LETTER Z WITH CARON
0x9F: {0xC5, 0xB8}, // LATIN CAPITAL LETTER Y WITH DIAERESIS
}
if bytes, ok := cp1252[input[0]]; ok { if bytes, ok := cp1252[input[0]]; ok {
for _, b := range bytes { for _, b := range bytes {
output = append(output, b) output = append(output, b)