检查文件乱码和转换格式

维护的项目代码有很多编码不是utf-8的,导致保存会把中文注释变成乱码

不止我一个人遇到这个问题,项目中已经有很多被别人保存成乱码的文件

我研究了一下,把文件用utf-8或者gb2312打开,再尝试编码成gbk(GBK 兼容繁体字),如果编码失败,那基本就是乱码

代码如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os

def check_is_encode_error(string):
    """Return True if *string* cannot be represented in the GBK codec.

    Text that decoded fine but fails to encode to GBK is almost
    certainly mojibake (garbled Chinese), which is what this detects.
    """
    try:
        string.encode('gbk')
        return False
    except UnicodeEncodeError:
        return True

def check_file(filepath):
    """Scan one text file and report lines that look like mojibake.

    The file is decoded as UTF-8 first; on failure we fall back to GBK.
    GBK is a strict superset of GB2312, so every file the original
    gb2312 fallback could read still decodes, and fewer valid legacy
    files end up reported as "check failed" by the caller.
    A UnicodeDecodeError from the fallback propagates to the caller.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        # Legacy-encoding fallback: gbk decodes everything gb2312 does.
        with open(filepath, "r", encoding="gbk") as file:
            lines = file.readlines()

    # Line numbers are 1-based for human-readable reports.
    for num, line in enumerate(lines, 1):
        if check_is_encode_error(line):
            print("文件存在乱码 {}:{}".format(filepath, num))

def check_dir(dir):
    """Walk *dir* recursively and run check_file on every file found.

    Files that cannot be decoded at all are reported as "检查失败"
    instead of aborting the whole scan.
    """
    # NOTE(review): the parameter name shadows the builtin `dir`; kept
    # for interface compatibility with existing callers.
    for root, dirs, files in os.walk(dir):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                check_file(filepath)
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C still works;
                # the dead `pass` before the print is dropped.
                print("文件 {} 检查失败".format(filepath))

# 把"path_to_dir"替换成你要检查的文件夹
check_dir("path_to_dir")

再根据git blame翻commit历史,替换掉所有乱码注释以后,把所有不是utf-8的文件转换成utf-8编码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
import chardet
import codecs

def convert_to_utf8(src_path, src_encoding):
    """Re-encode the file at *src_path* from *src_encoding* to UTF-8 in place.

    Reads and writes in binary mode, so line endings and all other bytes
    survive the round trip unchanged (matching codecs.open, which also
    performs no newline translation; codecs.open is deprecated since
    Python 3.11). Decoding completes before the file is reopened for
    writing, so a UnicodeDecodeError propagates with the original file
    left untouched.
    """
    with open(src_path, 'rb') as src_file:
        content = src_file.read().decode(src_encoding)

    with open(src_path, 'wb') as dst_file:
        dst_file.write(content.encode('utf-8'))

def detect_file_encoding(file_path):
    """Guess the encoding of *file_path* using chardet.

    Reads the whole file as bytes and returns chardet's best guess,
    which may be None when the content is undetectable.
    """
    with open(file_path, 'rb') as f:
        raw = f.read()
    detection = chardet.detect(raw)
    return detection.get('encoding')

def check_dir(dir_path):
    """Walk *dir_path* and convert every non-UTF-8 source file to UTF-8.

    Only .h/.cpp/.jce files are touched. Files whose encoding chardet
    cannot identify (encoding is None) are reported and skipped instead
    of being passed to convert_to_utf8, and the comparison is
    case-insensitive because chardet reports mixed-case names such as
    'GB2312' or 'UTF-8-SIG'.
    """
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            if file.endswith(('.h', '.cpp', '.jce')):
                file_path = os.path.join(root, file)
                encoding = detect_file_encoding(file_path)
                if encoding is None:
                    # Undetectable (possibly binary) -- leave it alone.
                    print("File: {} | Encoding: {}".format(file_path, encoding))
                    continue
                if encoding.lower() not in ('ascii', 'utf-8'):
                    print("File: {} | Encoding: {}".format(file_path, encoding))
                    print("Converting file: {} | Encoding: {}".format(file_path, encoding))
                    convert_to_utf8(file_path, encoding)

# 把"path_to_dir"替换成你要检查的文件夹
check_dir("path_to_dir")

收工!