0. 前言
最近突然想知道自己总共写了多少行代码,于是做了这样一个小工具……
1. 准备工作
先考虑一下希望得到的效果:
Language(语言) | Lines(代码行数) | Size(代码文件总大小) | Files(代码文件总数) |
---|---|---|---|
A | 12345 | 300 KB | 193 |
B | 2345 | 165 KB | 98 |
如上,程序输出一个表格,将代码行数作为关键字排序。
代码框架:
# -*- encoding: utf-8 -*-
import ...# 代码行数计数类
class CodeLinesCounter(object):
    """Skeleton of the code line counter.

    Only the constructor does real work here; ``scan`` and ``report`` are
    stubs that are implemented in the later listings.
    """

    # Size units as (label, bytes per unit), smallest first.
    SIZES = [
        ('B', 1),
        ('KB', 1024),
        ('MB', 1024**2),
        ('GB', 1024**3),
        ('TB', 1024**4),
    ]

    def __init__(self, languages):
        """Store the suffix table and reset all statistics.

        :param languages: dict of {file suffix: language name}
        """
        self._languages = languages
        # Statistics per suffix: {suffix: (lines, size, number of files)}
        self._codelines = dict.fromkeys(languages, (0, 0, 0))
        # Number of files processed successfully / unsuccessfully
        self._successful = 0
        self._error = 0

    def scan(self, directory, log=False):
        """Scan a directory (stub).

        :param directory: the directory to scan
        :param log: whether to print log messages
        """
        if log:
            print('Scanning', directory)

    def report(self):
        """Print the results (stub)."""


# Create the CodeLinesCounter instance
counter = CodeLinesCounter(languages={
    'py': 'Python',
    'c': 'C',
    'cpp': 'C++',
    'java': 'Java',
    'js': 'JavaScript',
    'html': 'HTML',
    'css': 'CSS',
    'txt': 'Plain text',
})
counter.scan('E:/')  # scan drive E (note: a bare 'E:' must not be used)
counter.report()  # print the results
完成,下面正式进入主要部分
2. 统计
2.1 文件扫描
首先,我们需要获取根目录下的文件列表。这可以用os.walk实现:
os.walk(rootdir)返回一个生成器(可迭代),依次给出根目录及其每一级子目录下的目录列表和文件列表。我们来看一个例子:
有一文件夹Folder,结构如下:
Folder
├── file1
├── Folder1
│   ├── file2
│   └── file3
└── Folder2
    ├── file4
    └── Folder3
运行如下代码:
import os

# os.walk yields one (root, dirs, files) triple per directory under 'Folder';
# print each triple on its own line.
walker = os.walk('Folder')
for root, dirs, files in walker:
    print(root, dirs, files)
则输出如下:
Folder ['Folder1', 'Folder2'] ['file1']
Folder\Folder1 [] ['file2', 'file3']
Folder\Folder2 ['Folder3'] ['file4']
Folder\Folder2\Folder3 [] []
其中第一项是当前的根目录,第二项为目录下的目录列表,第三项则为当前的文件列表。
因此,我们可以编写如下代码:
# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk


class CodeLinesCounter(object):
    """Walk a directory tree and collect per-language file statistics.

    This intermediate version does not read file contents yet: every matching
    file is counted as one line, so only the size and file totals are real.
    """

    # Size units as (label, bytes per unit), smallest first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Store the suffix table and reset all statistics.

        :param languages: dict mapping a file suffix without the dot
            (e.g. ``'py'``) to the language name shown in the report.
        """
        self._languages = languages
        # suffix -> (lines, total size in bytes, number of files)
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Accumulate statistics for every known-suffix file under ``directory``.

        :param directory: root of the tree to walk (made absolute first).
        :param log: when true, print progress messages and each counted file.

        Ctrl+C stops the walk early; whatever was collected so far is kept.
        """
        if log:
            print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition leaves `dot` empty when the name has no '.',
                    # so an extensionless file called "c" or "py" is not
                    # mistaken for source code of that language.
                    dot, suffix = filename.rpartition('.')[1:]
                    filename = join(root, filename)
                    if dot and suffix in self._results:
                        lines, size, numFiles = self._results[suffix]
                        lines += 1  # placeholder: count each file as one line for now
                        numFiles += 1
                        size += getsize(filename)  # getsize returns the file size in bytes
                        self._results[suffix] = (lines, size, numFiles)
                        if log:
                            print(filename)
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log:
                print('Scan finished')

    def report(self):
        """Print one tab-separated row per language, largest totals first."""
        print('Language\tLines\tSize\tFiles')
        # Tuples compare element-wise, so this orders by lines, then size, then files.
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            print(self._languages[suffix], lines, self.__format_size(size), files, sep='\t')

    def __format_size(self, bytes):
        """Return ``bytes`` as text in the largest unit that keeps the number below 1024."""
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # Beyond 1024 of the largest unit: still express it in that unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)


counter = CodeLinesCounter(languages={'py': 'Python', 'c': 'C', 'cpp': 'C++', 'java': 'Java', 'js': 'JavaScript', 'html': 'HTML', 'css': 'CSS', 'txt': 'Plain text'})
counter.scan('E:/')
counter.report()
运行结果应类似于下面这样(手动整理了一下):
Language Lines Size Files
C++ 667 671.51 KB 667
Python 317 981.01 KB 317
HTML 38 466.52 KB 38
Plain text 34 90.69 KB 34
JavaScript 19 1.43 MB 19
CSS 9 341.04 KB 9
C 2 20.45 KB 2
Java 1 676.00 B 1
好,下面来到行数统计部分(表格输出后面会介绍)。
2.2 行数统计
众所周知,空行不应该算在代码行数中。因此,统计时需忽略空行。先写上如下代码(替换掉刚才代码中暂时按一行计数的 lines += 1 那一行):
# Read the file as UTF-8 and count only lines that contain something other
# than whitespace.
with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            lines += 1
但是,正当我们兴致勃勃地运行时——
Traceback (most recent call last):
  ...
  File "...\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 355: invalid start byte
程序报错UnicodeDecodeError。分析后发现,原因是部分文件使用了GBK编码,按utf-8编码无法正确解码,因此造成错误。
我们再次改进程序,使其尝试两种编码:
try:ln = 0with open(filename, 'r', encoding='utf-8') as f:for line in f:if line and not line.isspace():ln += 1
except UnicodeDecodeError: # 尝试使用GBK编码打开try:ln = 0with open(filename, 'r', encoding='gbk') as f:for line in f:if line and not line.isspace():ln += 1except:print(filename, '[Error: unknown encoding]')self._error += 1else:lines += ln
except Exception as e:print(filename, '[Unknown error: %s]' % e)self._error += 1continue
lines += ln
if log: print(f'{filename} [{ln}]')
self._successful += 1
这次,我们得到了正确的结果:
Language Lines Size Files
C++ 35595 671.51 KB 667
JavaScript 24485 1.43 MB 19
Python 24130 982.16 KB 317
CSS 8203 341.04 KB 9
HTML 6138 466.52 KB 38
Plain text 741 90.69 KB 34
C 557 20.45 KB 2
Java 29 676.00 B 1
现在仅剩最后一步了——制表。
3. 制表
Python输出表格可以使用PrettyTable库(第三方库,需先用 pip install prettytable 安装)。具体用法如下:
# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
from prettytable import PrettyTable


class CodeLinesCounter(object):
    """Count non-blank lines, sizes and file counts per language in a directory tree."""

    # Size units as (label, bytes per unit), smallest first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Store the suffix table and reset all statistics.

        :param languages: dict mapping a file suffix without the dot
            (e.g. ``'py'``) to the language name shown in the report.
        """
        self._languages = languages
        # suffix -> (non-blank lines, total size in bytes, number of files)
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Accumulate statistics for every known-suffix file under ``directory``.

        Files that cannot be read or decoded are reported on stdout, counted
        in the error total and left out of the per-language statistics.

        :param directory: root of the tree to walk (made absolute first).
        :param log: when true, print progress messages and one line per file.

        Ctrl+C stops the walk early; whatever was collected so far is kept.
        """
        if log:
            print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition leaves `dot` empty when the name has no '.',
                    # so an extensionless file called "c" or "py" is not
                    # mistaken for source code of that language.
                    dot, suffix = filename.rpartition('.')[1:]
                    filename = join(root, filename)
                    if not dot or suffix not in self._results:
                        if log:
                            print(filename, '[None]')
                        continue
                    try:
                        # getsize is inside the try so that one unreadable
                        # file cannot abort the whole scan.
                        size = getsize(filename)
                        ln = self.__count_lines(filename)
                    except UnicodeDecodeError:
                        print(filename, '[Error: unknown encoding]')
                        self._error += 1
                        continue
                    except Exception as e:
                        print(filename, '[Unknown error: %s]' % e)
                        self._error += 1
                        continue
                    if log:
                        print(f'{filename} [{ln}]')
                    self._successful += 1
                    # Each file contributes to the totals exactly once.
                    lines, total, numFiles = self._results[suffix]
                    self._results[suffix] = (lines + ln, total + size, numFiles + 1)
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log:
                print('Scan finished')

    def report(self):
        """Print the statistics as a table, largest line counts first."""
        # Create the PrettyTable instance with its column headers and a title.
        table = PrettyTable(['Language', 'Lines', 'Size', 'Files'], title=f'Scan result (OK {self._successful}, Error {self._error})')
        # Tuples compare element-wise, so this orders by lines, then size, then files.
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            table.add_row([self._languages[suffix], lines, self.__format_size(size), files])  # add one row
        print(table)  # print the table

    def __count_lines(self, filename):
        """Return the number of non-blank lines, trying UTF-8 and then GBK.

        :raises UnicodeDecodeError: if the file decodes in neither encoding.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                return sum(1 for line in f if line.strip())
        except UnicodeDecodeError:
            # Not valid UTF-8: recount the whole file from scratch as GBK.
            with open(filename, 'r', encoding='gbk') as f:
                return sum(1 for line in f if line.strip())

    def __format_size(self, bytes):
        """Return ``bytes`` as text in the largest unit that keeps the number below 1024."""
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # Beyond 1024 of the largest unit: still express it in that unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)


counter = CodeLinesCounter(languages={'py': 'Python', 'c': 'C', 'cpp': 'C++', 'java': 'Java', 'js': 'JavaScript', 'html': 'HTML',
                                      'css': 'CSS', 'txt': 'Plain text'})
counter.scan('E:/')
counter.report()
运行结果:
+----------------------------------------+
| Scan result (OK 1087, Error 0) |
+------------+-------+-----------+-------+
| Language | Lines | Size | Files |
+------------+-------+-----------+-------+
| C++ | 35595 | 671.51 KB | 667 |
| JavaScript | 24485 | 1.43 MB | 19 |
| Python | 24130 | 982.16 KB | 317 |
| CSS | 8203 | 341.04 KB | 9 |
| HTML | 6138 | 466.52 KB | 38 |
| Plain text | 741 | 90.69 KB | 34 |
| C | 557 | 20.45 KB | 2 |
| Java | 29 | 676.00 B | 1 |
+------------+-------+-----------+-------+
4. 总结
最终代码:
# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
from prettytable import PrettyTable


class CodeLinesCounter(object):
    """Count non-blank lines, sizes and file counts per language in a directory tree."""

    # Size units as (label, bytes per unit), smallest first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Store the suffix table and reset all statistics.

        :param languages: dict mapping a file suffix without the dot
            (e.g. ``'py'``) to the language name shown in the report.
        """
        self._languages = languages
        # suffix -> (non-blank lines, total size in bytes, number of files)
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Accumulate statistics for every known-suffix file under ``directory``.

        Files that cannot be read or decoded are reported on stdout, counted
        in the error total and left out of the per-language statistics.

        :param directory: root of the tree to walk (made absolute first).
        :param log: when true, print progress messages and one line per file.

        Ctrl+C stops the walk early; whatever was collected so far is kept.
        """
        if log:
            print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # A name without any '.' has no suffix at all; without the
                    # `dot` check a file called "c" would be counted as C.
                    dot, suffix = filename.rpartition('.')[1:]
                    filename = join(root, filename)
                    if not dot or suffix not in self._results:
                        if log:
                            print(filename, '[None]')
                        continue
                    try:
                        # Size lookup and reading share one error boundary so
                        # a single bad file never aborts the scan.
                        size = getsize(filename)
                        ln = self.__count_lines(filename)
                    except UnicodeDecodeError:
                        print(filename, '[Error: unknown encoding]')
                        self._error += 1
                        continue
                    except Exception as e:
                        print(filename, '[Unknown error: %s]' % e)
                        self._error += 1
                        continue
                    if log:
                        print(f'{filename} [{ln}]')
                    self._successful += 1
                    # Each file contributes to the totals exactly once.
                    lines, total, numFiles = self._results[suffix]
                    self._results[suffix] = (lines + ln, total + size, numFiles + 1)
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log:
                print('Scan finished')

    def report(self):
        """Print the statistics as a table, largest line counts first."""
        table = PrettyTable(['Language', 'Lines', 'Size', 'Files'], title=f'Scan result (OK {self._successful}, Error {self._error})')
        # Tuples compare element-wise, so this orders by lines, then size, then files.
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            table.add_row([self._languages[suffix], lines, self.__format_size(size), files])
        print(table)

    def __count_lines(self, filename):
        """Return the number of non-blank lines, trying UTF-8 and then GBK.

        :raises UnicodeDecodeError: if the file decodes in neither encoding.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                return sum(1 for line in f if line.strip())
        except UnicodeDecodeError:
            # Not valid UTF-8: recount the whole file from scratch as GBK.
            with open(filename, 'r', encoding='gbk') as f:
                return sum(1 for line in f if line.strip())

    def __format_size(self, bytes):
        """Return ``bytes`` as text in the largest unit that keeps the number below 1024."""
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # Beyond 1024 of the largest unit: still express it in that unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)


counter = CodeLinesCounter(languages={'py': 'Python', 'c': 'C', 'cpp': 'C++', 'java': 'Java', 'js': 'JavaScript', 'html': 'HTML', 'css': 'CSS', 'txt': 'Plain text'})
counter.scan('E:/')
counter.report()
后期改进:
- 增加正则表达式忽略文件
- 使用matplotlib绘图
- 使用PyQt5制作GUI
- ……(欢迎提出宝贵的意见!)