The project I maintain has a lot of source files that are not utf-8 encoded, and saving them keeps turning the Chinese comments into mojibake.
I am not the only one running into this; the repo already contains plenty of files that other people have garbled the same way.
After some digging, the check I settled on is: open the file as utf-8 (falling back to gb2312), then try to re-encode the text as gbk; if that conversion fails, the content is almost certainly mojibake.
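To see why the round-trip check catches it, here is a tiny made-up illustration (the sample comment and the way the bad save is simulated are my own, not from the project): a gbk-encoded comment that gets mis-read as utf-8 and saved back ends up containing characters that gbk simply cannot represent, so re-encoding it raises UnicodeEncodeError, while intact Chinese text round-trips without complaint.

# Hypothetical example: simulate a gbk file being mis-read as utf-8 and re-saved.
clean = "中文注释"
garbled = "中文注释".encode("gbk").decode("utf-8", errors="replace")

for text in (clean, garbled):
    try:
        text.encode("gbk")          # intact text re-encodes fine
        print(repr(text), "-> ok")
    except UnicodeEncodeError:      # mojibake contains characters outside gbk
        print(repr(text), "-> mojibake, cannot be re-encoded as gbk")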
The full detection script:
import os

# A line is "garbled" if it contains characters that cannot be encoded as gbk.
def check_is_encode_error(string):
    try:
        string.encode('gbk')
    except UnicodeEncodeError:
        return True
    return False

# Read the file as utf-8, falling back to gb2312, and report every line
# that fails the gbk round-trip check.
def check_file(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        with open(filepath, "r", encoding="gb2312") as file:
            lines = file.readlines()

    for num, line in enumerate(lines, 1):
        if check_is_encode_error(line):
            print("Garbled text in {}:{}".format(filepath, num))

# Walk the directory tree and check every file; a file that cannot be decoded
# at all is reported instead of aborting the whole scan.
def check_dir(dir):
    for root, dirs, files in os.walk(dir):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                check_file(filepath)
            except Exception:
                print("Failed to check file {}".format(filepath))


check_dir("path_to_dir")
After digging through the commit history with git blame and replacing all the garbled comments, the remaining step is to convert every file that is still not utf-8 into utf-8.
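The digging itself was manual, but a rough sketch of how a pre-mojibake version of a file can be pulled out of history looks like this (the repo path, file path, line number and the gbk assumption below are placeholders of mine, not part of the original workflow): git blame names the commit that introduced a garbled line, and reading the file from that commit's parent as raw bytes, decoded with the legacy encoding, gives back the readable comment.

import subprocess

# Hypothetical helpers; rel_path is relative to the repository root.
def blame_commit(repo_dir, rel_path, line_no):
    # The first token of "git blame --porcelain" output is the commit hash.
    out = subprocess.run(
        ["git", "blame", "--porcelain", "-L", "{0},{0}".format(line_no), "--", rel_path],
        cwd=repo_dir, capture_output=True, check=True)
    return out.stdout.split()[0].decode()

def file_before(repo_dir, rel_path, commit, legacy_encoding="gbk"):
    # "git show <commit>^:<path>" prints the raw bytes of the parent's version;
    # decoding them with the legacy encoding recovers the readable comments.
    out = subprocess.run(
        ["git", "show", "{}^:{}".format(commit, rel_path)],
        cwd=repo_dir, capture_output=True, check=True)
    return out.stdout.decode(legacy_encoding, errors="replace")

With the comments restored by hand, the script below converts everything that is still not utf-8: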
import os
import codecs

import chardet

def convert_to_utf8(src_path, src_encoding):
    # Read with the detected legacy encoding, then rewrite the file in place as utf-8.
    with codecs.open(src_path, 'r', src_encoding) as src_file:
        content = src_file.read()

    with codecs.open(src_path, 'w', 'utf-8') as dst_file:
        dst_file.write(content)

def detect_file_encoding(file_path):
    # chardet works on raw bytes and returns its best guess (possibly None).
    with open(file_path, 'rb') as f:
        rawdata = f.read()
    return chardet.detect(rawdata).get('encoding')

def check_dir(dir_path):
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            if file.endswith(('.h', '.cpp', '.jce')):
                file_path = os.path.join(root, file)
                encoding = detect_file_encoding(file_path)
                # ascii is a subset of utf-8, so only other encodings need converting.
                if encoding is not None and encoding not in ('ascii', 'utf-8'):
                    print("Converting file: {} | Encoding: {}".format(file_path, encoding))
                    convert_to_utf8(file_path, encoding)


check_dir("path_to_dir")
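Two notes on the design: chardet is only a statistical guess, so the printed encoding deserves a glance before trusting it on short files, and since the script rewrites files in place it is best run on a clean working tree so the resulting git diff shows exactly what changed.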
Done!