/// <summary>
/// Detects the text encoding of a file from its leading bytes.
/// </summary>
/// <remarks>
/// A byte-order mark decides the result when one is present: EF BB BF means UTF-8,
/// FE FF means big-endian UTF-16 and FF FE means little-endian UTF-16. A file without a
/// BOM is delegated to GetEncodingWithBomUtf8 with Encoding.Default (the ANSI code page)
/// as the fallback. An empty file has nothing to inspect and yields Encoding.Default.
/// </remarks>
/// <param name="filename">Path of the file to inspect.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="System.IO.FileNotFoundException">The file does not exist.</exception>
public static System.Text.Encoding GetFileEncoding(this string filename)
{
    if (!File.Exists(filename))
    {
        // FileNotFoundException derives from Exception, so existing catch blocks still match.
        throw new System.IO.FileNotFoundException("文件\"" + filename + "\"不存在!", filename);
    }

    using (var fs = new System.IO.FileStream(filename, System.IO.FileMode.Open, System.IO.FileAccess.Read))
    {
        // Read at most three bytes; short files simply produce fewer, and every check
        // below is guarded by the number of bytes actually read.
        var head = new byte[3];
        int headLength = 0;
        while (headLength < head.Length)
        {
            int read = fs.Read(head, headLength, head.Length - headLength);
            if (read == 0)
            {
                break;
            }
            headLength += read;
        }

        if (headLength == 0)
        {
            return System.Text.Encoding.Default;
        }

        if (headLength >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
        {
            return System.Text.Encoding.UTF8;
        }

        if (headLength >= 2 && head[0] == 0xFE && head[1] == 0xFF)
        {
            return System.Text.Encoding.BigEndianUnicode;
        }

        if (headLength >= 2 && head[0] == 0xFF && head[1] == 0xFE)
        {
            return System.Text.Encoding.Unicode;
        }

        // Rewind so the helper examines the file from its first byte; starting after the
        // bytes consumed above could begin in the middle of a multi-byte character.
        fs.Seek(0, System.IO.SeekOrigin.Begin);
        return GetEncodingWithBomUtf8(fs, System.Text.Encoding.Default);
    }
}
/// <summary>
/// Determines the encoding of the content of a stream, recognising UTF-8 that has no BOM.
/// </summary>
/// <remarks>
/// A seekable stream is rewound first, because a byte-order mark is only meaningful at the
/// very beginning of the content. The content is then buffered in full. A leading BOM
/// decides the result outright; otherwise the content is reported as UTF-8 when
/// IsUtf8Bytes accepts it and as <paramref name="defaultEncoding"/> when it does not.
/// Empty content, and content too large to buffer in a single array, yields
/// <paramref name="defaultEncoding"/>. As in the original implementation, the stream is
/// disposed when this method returns because the BinaryReader wrapping it owns it.
/// </remarks>
/// <param name="fs">Stream over the file content; must support Length.</param>
/// <param name="defaultEncoding">Encoding to report when nothing more specific is detected.</param>
/// <returns>The detected encoding.</returns>
private static System.Text.Encoding GetEncodingWithBomUtf8(Stream fs, Encoding defaultEncoding)
{
    using (var reader = new System.IO.BinaryReader(fs))
    {
        if (fs.CanSeek)
        {
            fs.Seek(0, System.IO.SeekOrigin.Begin);
        }

        long remaining = fs.Length - fs.Position;
        if (remaining <= 0 || remaining > int.MaxValue)
        {
            // Nothing to inspect, or too much to validate in memory: make no UTF-8 claim.
            return defaultEncoding;
        }

        byte[] content = reader.ReadBytes((int)remaining);
        if (content.Length == 0)
        {
            return defaultEncoding;
        }

        // A BOM is authoritative. Returning here keeps the UTF-8 scan below from
        // overriding it (UTF-16 text made of ASCII characters also passes that scan).
        if (content.Length >= 3 && content[0] == 0xEF && content[1] == 0xBB && content[2] == 0xBF)
        {
            return Encoding.UTF8;
        }

        if (content.Length >= 2)
        {
            if (content[0] == 0xFE && content[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode;
            }

            if (content[0] == 0xFF && content[1] == 0xFE)
            {
                return Encoding.Unicode;
            }
        }

        // No BOM: validate the whole content, including its first bytes.
        return IsUtf8Bytes(content) ? Encoding.UTF8 : defaultEncoding;
    }
}
/// <summary>
/// Checks whether a byte sequence is structurally valid UTF-8 (used for content without a BOM).
/// </summary>
/// <remarks>
/// Only the lead-byte / continuation-byte structure is checked. Bytes below 0x80 are
/// single-byte characters, so pure ASCII and empty input are accepted. Lead bytes
/// announcing 2 to 6 bytes are accepted. Input that ends in the middle of a multi-byte
/// character is reported as not UTF-8 rather than raising an exception.
/// </remarks>
/// <param name="data">Bytes to examine.</param>
/// <returns>true when every character is well-formed; otherwise false.</returns>
private static bool IsUtf8Bytes(byte[] data)
{
    // Bytes still expected for the character being examined, counting the lead byte.
    // A value of 1 means the next byte starts a new character.
    int charByteCounter = 1;

    for (int i = 0; i < data.Length; i++)
    {
        byte curByte = data[i];

        if (charByteCounter == 1)
        {
            if (curByte >= 0x80)
            {
                // Count the leading 1 bits of the lead byte: 110xxxxx -> 2 ... 1111110x -> 6.
                while (((curByte <<= 1) & 0x80) != 0)
                {
                    charByteCounter++;
                }

                // A counter still at 1 means the byte was 10xxxxxx, which cannot start a
                // character; more than six leading 1 bits is not a lead byte either.
                if (charByteCounter == 1 || charByteCounter > 6)
                {
                    return false;
                }
            }
        }
        else
        {
            // Inside a multi-byte character every following byte must be 10xxxxxx.
            if ((curByte & 0xC0) != 0x80)
            {
                return false;
            }

            charByteCounter--;
        }
    }

    // A counter above 1 here means the data stopped part-way through a character.
    return charByteCounter == 1;
}
Java 版本:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.util.BitSet;
public class EncodeUtils {
private static final Logger logger = LoggerFactory.getLogger(EncodeUtils.class);
private static final int BYTE_SIZE = 8;
private static final String CODE_UTF8 = "UTF-8";
private static final String CODE_UTF16 = "UTF-16";//Unicode
private static final String CODE_UTF16LE = "UTF-16LE";//Unicode big endian
private static final String CODE_GBK = "GBK"; //ABSU
* 通过文件全名称获取编码集名称
public static String getEncode(String fullFileName) throws Exception {
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName));
return getEncode(bis, CODE_GBK);
* 通过文件全名称获取编码集名称
public static String getEncode(String fullFileName, String defaultEncoding) throws Exception {
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName));
return getEncode(bis, defaultEncoding);
* 通过文件缓存流获取编码集名称,文件流必须为未曾
* @param bis 文件流
public static String getEncode(BufferedInputStream bis, String defaultEncoding) throws Exception {
bis.mark(0);
String encodeType;
byte[] head = new byte[3];
bis.read(head);
if (head[0] == -1 && head[1] == -2 && head[2] == (byte) 0x41) {
encodeType = CODE_UTF16;
} else if (head[0] == -2 && head[1] == -1 && head[2] == 0) {
//encodeType = "Unicode";
encodeType = CODE_UTF16LE;
} else if (head[0] == -17 && head[1] == -69 && head[2] == -65) {
//带BOM的UTF8 (CODE_UTF8_BOM)
encodeType = CODE_UTF8;
} else {
if (isUTF8(bis)) {
encodeType = CODE_UTF8;
} else {
encodeType = defaultEncoding;
return encodeType;
* 是否是无BOM的UTF8格式,不判断常规场景,只区分无BOM UTF8和GBK
private static boolean isUTF8(BufferedInputStream bis) throws Exception {
bis.reset();
//读取第一个字节
int code = bis.read();
BitSet bitSet = convert2BitSet(code);
//判断是否为单字节
if (bitSet.get(0)) {//多字节时,再读取N个字节
if (!checkMultiByte(bis, bitSet)) {//未检测通过,直接返回
return false;
code = bis.read();
} while (code != -1);
return true;
* 检测多字节,判断是否为utf8,已经读取了一个字节
private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception {
int count = getCountOfSequential(bitSet);
byte[] bytes = new byte[count - 1];//已经读取了一个字节,不能再读取
bis.read(bytes);
for (byte b : bytes) {
if (!checkUtf8Byte(b)) {
return false;
return true;
* 检测bitSet中从开始有多少个连续的1
private static int getCountOfSequential(BitSet bitSet) {
int count = 0;
for (int i = 0; i < BYTE_SIZE; i++) {
if (bitSet.get(i)) {
count++;
} else {
break;
return count;
* 检测单字节,判断是否为utf8
private static boolean checkUtf8Byte(byte b) throws Exception {
BitSet bitSet = convert2BitSet(b);
return bitSet.get(0) && !bitSet.get(1);
* 将整形转为BitSet
private static BitSet convert2BitSet(int code) {
BitSet bitSet = new BitSet(BYTE_SIZE);
for (int i = 0; i < BYTE_SIZE; i++) {
int tmp3 = code >> (BYTE_SIZE - i - 1);
int tmp2 = 0x1 & tmp3;
if (tmp2 == 1) {
bitSet.set(i);
return bitSet;
public static void main(String[] args) {
String filePath = "C:\\110025.txt";
try {
String encoding = getEncode(filePath);
System.out.println(encoding);
} catch (Exception ex) {
logger.warn("文件检测编码出错!", ex);