python

python统计docx和pdf文件的字数

前言

前天在公司接到一个需求，审合同时需要根据合同总字数来做一个字数挡位统计，让我用python来统计一下pdf和docx文件的字数

通过查阅资料结合大模型写了两个函数来统计字数

统计Docx字数

from docx import Document
import re


def get_word_count_like_ms_word(docx_path):
    """
    获取与MS Word显示一致的字数统计（中文+标点）
    """
    doc = Document(docx_path)
    total_count = 0

    for para in doc.paragraphs:
        text = para.text

        # 移除页脚页数（如·1·、- 1 -、第1页等）
        text = re.sub(r'[·\-]\s*\d+\s*[·\-]|第\s*\d+\s*页','', text)

        # 移除换行符、制表符等特殊空白字符
        text = re.sub(r'[\s\t\n\r\u3000]', '', text)

        # 统计剩余所有字符（中文+标点）
        total_count += len(text)

    return total_count


# 使用示例
count = get_word_count_like_ms_word("建筑工程廉政协议20230228171235.docx")
print(f"Word字数: {count}")

统计PDF字数

import pdfplumber
import re

def get_pdf_word_count_like_ms_word(pdf_path):
    """
    获取与MS Word显示一致的PDF字数统计（中文+标点）
    自动移除页脚页数（如·1·、·2·等格式）
    """
    total_count = 0

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # 提取页面文本
            text = page.extract_text()

            if text:
                # 移除页脚页数（如·1·、- 1 -、第1页等）
                text = re.sub(r'[·\-]\s*\d+\s*[·\-]|第\s*\d+\s*页','', text)

                # 移除所有空白字符
                cleaned_text = re.sub(r'[\s\u3000]', '', text)

                # 统计剩余字符
                total_count += len(cleaned_text)
    return total_count


# 使用示例
pdf_path = "建筑工程廉政协议20230228171235.pdf"
count = get_pdf_word_count_like_ms_word(pdf_path)
print(f"PDF字数: {count}")