起初想用 unoconv 将各类文档转成 pdf,但 txt 文件的编码问题确实让人头疼。下面这个方法基本可以将 txt 的 4 种编码格式都统一起来。
注意:Linux 下需设置环境变量 LANG=C.UTF-8
package until
import (
"unicode/utf16"
"bytes"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
"io/ioutil"
)
// Utf16toString converts the raw bytes of a text file into a Go string.
//
// The first two bytes select the decoding:
//   - FE FF: the remainder is decoded as big-endian UTF-16.
//   - FF FE: the remainder is decoded as little-endian UTF-16.
//   - anything else: the whole input is passed to GbkToUtf8.
//
// Inputs shorter than two bytes are returned unchanged. A dangling final byte
// after a BOM is ignored instead of disabling detection altogether, and
// odd-length input without a BOM is still handed to GbkToUtf8. If GbkToUtf8
// reports an error the original bytes are returned as-is rather than an empty
// string, so no input is silently dropped.
func Utf16toString(b []uint8) string {
	if len(b) < 2 {
		return string(b)
	}

	var littleEndian bool
	switch {
	case b[0] == 0xfe && b[1] == 0xff:
		// Big-endian BOM: the high byte of every code unit comes first.
	case b[0] == 0xff && b[1] == 0xfe:
		littleEndian = true
	default:
		decoded, err := GbkToUtf8(b)
		if err != nil {
			// Best effort: keep the caller's data instead of returning "".
			return string(b)
		}
		return string(decoded)
	}

	// Skip the BOM; integer division drops an incomplete trailing code unit.
	payload := b[2:]
	units := make([]uint16, len(payload)/2)
	for i := range units {
		hi, lo := payload[2*i], payload[2*i+1]
		if littleEndian {
			hi, lo = lo, hi
		}
		units[i] = uint16(hi)<<8 | uint16(lo)
	}
	return string(utf16.Decode(units))
}
func GbkToUtf8(s []byte) ([]byte, error) {
reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewDecoder())
d, e := ioutil.ReadAll(reader)
if e != nil {
return nil, e
}
return d, nil
}
// Utf16toString converts the raw bytes of a text file into a Go string.
//
// The first two bytes select the decoding:
//   - FE FF: the remainder is decoded as big-endian UTF-16.
//   - FF FE: the remainder is decoded as little-endian UTF-16.
//   - 56 4D: the whole input is passed to GbkToUtf8.
//   - anything else: the bytes are returned unchanged.
//
// Inputs shorter than two bytes are returned unchanged, and an incomplete
// trailing UTF-16 code unit is ignored. If GbkToUtf8 reports an error the
// original bytes are returned as-is rather than an empty string.
func Utf16toString(b []uint8) string {
	if len(b) < 2 {
		return string(b)
	}

	// hiOff and loOff are the positions, inside each 2-byte code unit, of the
	// most and least significant byte.
	var hiOff, loOff int
	switch prefix := uint16(b[0])<<8 | uint16(b[1]); prefix {
	case 0xfeff:
		hiOff, loOff = 0, 1
	case 0xfffe:
		hiOff, loOff = 1, 0
	case 0x564d:
		// NOTE(review): 0x56 0x4D is ASCII "VM"; why only input starting with
		// these two bytes is treated as GBK is not evident here — confirm
		// against the files this is used on.
		decoded, err := GbkToUtf8(b)
		if err != nil {
			// Best effort: keep the caller's data instead of returning "".
			return string(b)
		}
		return string(decoded)
	default:
		return string(b)
	}

	// Skip the BOM; integer division drops an incomplete trailing code unit.
	payload := b[2:]
	units := make([]uint16, len(payload)/2)
	for i := range units {
		units[i] = uint16(payload[2*i+hiOff])<<8 | uint16(payload[2*i+loOff])
	}
	return string(utf16.Decode(units))
}