按指定编码格式读取文本文件并返回内容
本帖最后由 小菜123 于 2022-5-22 16:34 编辑。
解决 UTF-8 等文本文件读入乱码的问题。
;;;by小菜 测试 (read_txt_file_as_encoding "f:\\test.txt" "utf-8")
;;; Read a text file with the given character set and return its content.
;;;
;;; FileName - path of the file to read.
;;; encoding - ADODB charset name, e.g. "utf-8", "unicode", "gb2312".
;;;
;;; Returns the decoded text as a string, or nil when the ADODB.Stream
;;; object cannot be created or the file cannot be loaded/decoded.
;;; The stream is always closed and released, even when a COM call fails.
(defun read_txt_file_as_encoding (FileName encoding / bstr stream result)
  (vl-load-com) ; make sure the vlax-* ActiveX functions are available
  (if (setq stream (vlax-create-object "ADODB.Stream"))
    (progn
      ;; Run every COM call under vl-catch-all-apply so that a failure
      ;; (missing file, unknown charset, ...) cannot skip the clean-up below.
      (setq result
        (vl-catch-all-apply
          (function
            (lambda ()
              (vlax-put-property stream 'Type 2) ; 2 = adTypeText
              (vlax-put-property stream 'Charset encoding)
              (vlax-invoke stream 'Open)
              (vlax-invoke-method stream 'LoadFromFile FileName)
              (vlax-invoke-method stream 'ReadText -1) ; -1 = adReadAll
            )
          )
        )
      )
      ;; Close may itself fail if the stream was never opened; ignore that.
      (vl-catch-all-apply 'vlax-invoke-method (list stream 'Close))
      (vlax-release-object stream)
      (if (not (vl-catch-all-error-p result))
        (setq bstr result)
      )
    )
  )
  bstr
)
本帖最后由 小菜123 于 2023-4-11 20:12 编辑
另一个帖子也讨论到这个问题,正好完善一下以前的函数
http://bbs.mjtd.com/thread-187537-1-1.html
;;; Guess the text encoding of a file and return an ADODB charset name.
;;;
;;; file - path of the file to inspect.
;;;
;;; Detection order:
;;;   1. BOM EF BB BF          -> "UTF-8"
;;;   2. BOM FF FE             -> "Unicode"     (UTF-16 little-endian)
;;;   3. BOM FE FF             -> "unicodeFFFE" (UTF-16 big-endian)
;;;   4. no BOM, valid UTF-8   -> "UTF-8"       (checked by Is_UTF8_No_BOM)
;;;   5. anything else         -> "GB2312"
;;; An empty file has no bytes to contradict UTF-8 and is reported as "UTF-8".
;;;
;;; Returns the charset name, or nil when the ADODB.Stream object cannot be
;;; created or the file cannot be read.
(defun detect_encoding (file / encoding stm_obj str raw b1 b2 b3)
  (vl-load-com) ; make sure the vlax-* ActiveX functions are available
  (if (setq stm_obj (vlax-create-object "ADODB.Stream"))
    (progn
      ;; Run every COM call under vl-catch-all-apply so that a failure
      ;; cannot skip the Close/release below.
      (setq raw
        (vl-catch-all-apply
          (function
            (lambda ()
              (vlax-put-property stm_obj 'Type 1) ; 1 = adTypeBinary
              (vlax-put-property stm_obj 'Mode 3) ; 3 = adModeReadWrite
              (vlax-invoke-method stm_obj 'Open)
              ;; LoadFromFile leaves the position at the start of the stream.
              (vlax-invoke-method stm_obj 'LoadFromFile file)
              (vlax-invoke-method stm_obj 'Read -1) ; -1 = adReadAll
            )
          )
        )
      )
      ;; Close may itself fail if the stream was never opened; ignore that.
      (vl-catch-all-apply 'vlax-invoke-method (list stm_obj 'Close))
      (vlax-release-object stm_obj)
      (if (not (vl-catch-all-error-p raw))
        (progn
          ;; Read yields a byte-array variant; for an empty file the variant
          ;; holds no array, in which case str stays nil (no bytes).
          (if (= (type raw) 'variant)
            (setq raw (vlax-variant-value raw))
          )
          (if (= (type raw) 'safearray)
            (setq str (vlax-safearray->list raw))
          )
          ;; equal is used so that missing bytes (nil) simply do not match.
          (setq b1 (car str)
                b2 (cadr str)
                b3 (caddr str)
          )
          (setq encoding
            (cond
              ((and (equal b1 239) (equal b2 187) (equal b3 191)) "UTF-8")
              ((and (equal b1 255) (equal b2 254)) "Unicode")
              ;; Big-endian needs its own charset name; "Unicode" would
              ;; decode the byte pairs in the wrong order.
              ((and (equal b1 254) (equal b2 255)) "unicodeFFFE")
              ((Is_UTF8_No_BOM str) "UTF-8")
              (t "GB2312")
            )
          )
        )
      )
    )
  )
  encoding
)
;;; Check whether a list of byte values is well-formed UTF-8.
;;;
;;; str - list of integers 0..255 (the raw bytes of a file without a BOM).
;;;
;;; Returns T when every byte belongs to a valid 1-, 2-, 3- or 4-byte
;;; UTF-8 sequence (an empty list is also T), otherwise nil.
;;; A sequence truncated at the end of the list is treated as invalid.
(defun Is_UTF8_No_BOM (str / is_UTF8 lead need)
  (setq is_UTF8 t)
  (while (and str is_UTF8)
    (setq lead (car str)
          str  (cdr str)
          ;; Number of continuation bytes required by the lead byte,
          ;; or nil when the byte cannot start a sequence.
          need (cond
                 ((< lead 128) 0)   ; 0x00-0x7F: ASCII
                 ((< lead 194) nil) ; 0x80-0xBF: stray continuation byte;
                                    ; 0xC0-0xC1: overlong encoding, never valid
                 ((< lead 224) 1)   ; 0xC2-0xDF: 2-byte sequence
                 ((< lead 240) 2)   ; 0xE0-0xEF: 3-byte sequence
                 ((< lead 245) 3)   ; 0xF0-0xF4: 4-byte sequence
                 (t nil)            ; 0xF5-0xFF: beyond U+10FFFF
               )
    )
    (if need
      ;; Consume the continuation bytes; each must match 10xxxxxx.
      ;; Walking the list directly avoids calling length on every character.
      (repeat need
        (if (and is_UTF8 str (= (logand (car str) 192) 128))
          (setq str (cdr str))
          (setq is_UTF8 nil)
        )
      )
      (setq is_UTF8 nil)
    )
  )
  is_UTF8
)
谢谢菜总分享 能否先自动判断编码方式,然后再决定是否采用呢? Bao_lai 发表于 2022-5-22 15:46
能否先自动判断编码方式,然后再决定是否采用呢?
理论上可以,但对于没有BOM标志的文件难度太大 小菜123 发表于 2022-5-22 16:36
理论上可以,但对于没有BOM标志的文件难度太大
http://bbs.mjtd.com/forum.php?mod=viewthread&tid=82886&highlight=%B1%E0%C2%EB 求CAD属性块文字递增刷以及文字刷
编码格式未知时,用下面的方式读取即可自动判断:
(read_txt_file_as_encoding FileName (detect_encoding FileName )) 小菜123 发表于 2023-4-10 20:50
编码格式未知时用下面方式读取即可自动判断:
(read_txt_file_as_encoding FileName (detect_encoding File ...
测试了一下,(detect_encoding "D:\\图纸目录.txt") 返回 nil,不知何故,就是普通的 txt 文件。
刚好最近遇到了这个问题,学习学习。
页:
[1]