当前位置: 首页 > news >正文

flash网站建设技术...商务网站平台建设预算

flash网站建设技术...,商务网站平台建设预算,小学生个人网站怎么做,内网专线和外网专线区别pdfplumber 的特点 1、它是一个纯 python 第三方库,适合 python 3.x 版本 2、它用来查看pdf各类信息,能有效提取文本、表格 3、它不支持修改或生成pdf,也不支持对pdf扫描件的处理 import glob import pdfplumber import re from collection…

pdfplumber 的特点

1、它是一个纯 python 第三方库,适合 python 3.x 版本
2、它用来查看pdf各类信息,能有效提取文本、表格
3、它不支持修改或生成pdf,也不支持对pdf扫描件的处理

import glob
import pdfplumber
import re
from collections import defaultdict
import json
import os


class PDFProcessor:
    """Flatten a PDF into an ordered sequence of text-line and table-row records.

    Records are stored in ``all_text`` keyed by a running integer. Each record
    is a dict with the keys ``page``, ``allrow``, ``type`` and ``inside``;
    ``type`` is ``'text'``, ``'excel'`` (a table row), ``'页眉'`` (header) or
    ``'页脚'`` (footer).

    The instance may be used as a context manager so that the underlying PDF
    handle is closed deterministically.
    """

    # A line whose accumulated text ends with one of these is considered
    # complete, so the following word is never glued onto it.
    # NOTE(review): the punctuation in these patterns is ASCII; if the source
    # PDFs use full-width ";", ":" or "()" the patterns need the full-width
    # forms instead - confirm against real documents.
    _LINE_END_RE = re.compile(
        r'(?:。|;|单位:元|单位:万元|币种:人民币|\d|报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
    )
    # Page header: a line ending in "...报告" (optionally with a revision suffix).
    _HEADER_RE = re.compile(r'[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$')
    # Page footer: a line starting with page-number-like characters. Written as
    # a raw string so that ``\\`` really is "one literal backslash"; the
    # non-raw original collapsed ``\\|`` into ``\|`` and matched a literal "|/".
    _FOOTER_RE = re.compile(r'^(?:\d|\\|\/|第|共|页|-|_| ){1,}')

    def __init__(self, filepath):
        """Open ``filepath`` with pdfplumber and reset the record state."""
        self.filepath = filepath
        # Open the document; mind where the file is stored.
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)
        self.allrow = 0  # key of the next record to be written
        self.last_num = 0  # key of the last record of the previous page

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying PDF handle."""
        self.pdf.close()

    def check_lines(self, page, top, buttom):
        """Rebuild text lines from the words of ``page`` inside a vertical band.

        Args:
            page: object exposing ``extract_words()``, ``height`` and ``width``.
            top: lower bound of the band (exclusive); ``''`` means unbounded.
            buttom: upper bound of the band (exclusive); ``''`` together with
                ``top == ''`` selects the whole page.

        Returns:
            The words joined into one string with ``'\\n'`` between lines.
            Words whose ``top`` differs by at most 2 share a line; a visual
            line that ran past 85% of the page width is glued to the next one
            unless the text so far matches ``_LINE_END_RE``.
        """
        whole_page = top == '' and buttom == ''
        # The whole-page pass accepts continuation lines a little lower on the
        # page than the table-bounded passes do.
        body_ratio = 0.9 if whole_page else 0.85

        text = ''
        last_top = 0
        last_check = 0
        for word in page.extract_words():
            word_top = word['top']
            if whole_page:
                in_band = True
            elif top == '':
                in_band = word_top > buttom
            else:
                in_band = buttom < word_top < top

            if in_band:
                if abs(last_top - word_top) <= 2:
                    # Same visual line as the previous word.
                    text += word['text']
                elif (last_check > 0
                      and page.height * body_ratio - word_top > 0
                      and not self._LINE_END_RE.search(text)):
                    # Previous line reached the right margin: treat as a wrap.
                    text += word['text']
                else:
                    text += '\n' + word['text']

            # Tracked for every word, including those outside the band.
            last_top = word_top
            last_check = word['x1'] - page.width * 0.85
        return text

    def drop_empty_cols(self, data):
        """Return ``data`` (a list of rows) without columns that are all ``''``."""
        columns = [list(col) for col in zip(*data)]
        kept = [col for col in columns if not all(cell == '' for cell in col)]
        return [list(row) for row in zip(*kept)]

    @staticmethod
    def keep_visible_lines(obj):
        """Filter predicate for page objects.

        A ``rect`` is dropped when its ``non_stroking_color`` is ``None`` or
        when both its width and height are below 1. A ``char`` is kept only if
        both ``stroking_color`` and ``non_stroking_color`` are set. Everything
        else is kept.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return (obj['stroking_color'] is not None
                    and obj['non_stroking_color'] is not None)
        return True

    def _append_records(self, page_number, record_type, contents):
        """Append one record per item of ``contents`` under consecutive keys."""
        for content in contents:
            self.all_text[self.allrow] = {'page': page_number, 'allrow': self.allrow,
                                          'type': record_type, 'inside': content}
            self.allrow += 1

    @staticmethod
    def _merge_continuation_rows(rows):
        """Fold rows whose first cell is ``None`` into the closest row above.

        Such rows are treated as continuations of a wrapped row. Their
        non-blank cells are appended to the anchor row and cleared. ``rows`` is
        modified in place and returned.
        """
        pending = 0  # consecutive continuation rows seen so far
        for index, row in enumerate(rows):
            if row[0] is not None:
                pending = 0
                continue
            pending += 1
            anchor = index - pending
            if anchor < 0:
                # No anchor row above; a negative index would silently merge
                # this row into the LAST row of the table.
                continue
            for col, cell in enumerate(row):
                if cell is not None and cell not in ('', ' '):
                    if rows[anchor][col] is None:
                        rows[anchor][col] = cell
                    else:
                        rows[anchor][col] += cell
                    row[col] = None
        return rows

    @staticmethod
    def _clean_rows(rows):
        """Drop continuation/blank rows and normalise cells to plain strings."""
        cleaned = []
        for row in rows:
            if row[0] is None:
                continue
            cells = ['' if cell is None else cell.replace('\n', '') for cell in row]
            if any(cell != '' for cell in cells):
                cleaned.append(cells)
        return cleaned

    def _tag_header_footer(self, page_number):
        """Re-label the current page's first and last records as header/footer."""
        # check_lines() starts its result with '\n' whenever the first word is
        # more than 2 units from the top, so the first record of a page is
        # usually '' and the first real line sits one key further on.
        first_key = 1 if self.last_num == 0 else self.last_num + 2
        try:
            first_text = str(self.all_text[first_key]['inside'])
            last_key = len(self.all_text) - 1
            end_text = str(self.all_text[last_key]['inside'])
            # NOTE(review): the original indentation was lost; the header and
            # footer checks are treated as independent of each other - confirm.
            if self._HEADER_RE.search(first_text) and '[' not in end_text:
                self.all_text[first_key]['type'] = '页眉'
            if self._FOOTER_RE.search(end_text) and '[' not in end_text:
                self.all_text[last_key]['type'] = '页脚'
        except KeyError:
            # The page produced too few records to have a header/footer slot.
            print(page_number)

    def extract_text_and_tables(self, page):
        """Append the text lines and table rows of ``page`` to ``all_text``.

        Text above each table, the table rows, and finally the text below the
        last table are emitted in order; a page without tables is emitted as
        plain text. Afterwards the first/last records of the page may be
        re-labelled as header/footer and ``last_num`` is updated.
        """
        buttom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if tables:
            for table in tables:
                if table.bbox[3] < buttom:
                    # Table ends above the previous table's bottom edge: skip.
                    continue
                text = self.check_lines(page, table.bbox[1], buttom)
                self._append_records(page.page_number, 'text', text.split('\n'))

                buttom = table.bbox[3]
                rows = self._clean_rows(self._merge_continuation_rows(table.extract()))
                rows = self.drop_empty_cols(rows)
                self._append_records(page.page_number, 'excel',
                                     [str(row) for row in rows])
            # Always emit the text below the last processed table. Previously
            # this was skipped whenever any table on the page had been skipped.
            text = self.check_lines(page, '', buttom)
            self._append_records(page.page_number, 'text', text.split('\n'))
        else:
            # Plain text page.
            text = self.check_lines(page, '', '')
            self._append_records(page.page_number, 'text', text.split('\n'))

        self._tag_header_footer(page.page_number)
        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Run ``extract_text_and_tables`` over every page of the PDF."""
        for page in self.pdf.pages:
            self.extract_text_and_tables(page)

    def save_all_text(self, path):
        """Write every record to ``path`` as one UTF-8 JSON object per line."""
        with open(path, 'w', encoding='utf-8') as file:
            for record in self.all_text.values():
                file.write(json.dumps(record, ensure_ascii=False) + '\n')


def process_all_pdfs_in_folder(folder_path):
    """Convert every file in ``folder_path`` and save the records as ``.txt``.

    Results go to the ``RAG_ASMPLE_DATAS_TXTS`` directory (created on demand).
    A failure on one file is reported and does not stop the batch.
    """
    os.makedirs('RAG_ASMPLE_DATAS_TXTS', exist_ok=True)
    for file_path in sorted(glob.glob(f'{folder_path}/*'), reverse=True):
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            try:
                processor.process_pdf()
                # basename() handles both '/' and the platform's own separator.
                save_path = 'RAG_ASMPLE_DATAS_TXTS/' + os.path.basename(file_path).replace('.pdf', '.txt')
                processor.save_all_text(save_path)
            finally:
                processor.close()
        except Exception as exc:  # best-effort batch: report and carry on
            print('check', file_path, repr(exc))


if __name__ == '__main__':
    # Path of the PDF file to parse
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.pdf'
    # Output txt file holding the parsed PDF content
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.txt'
    processor = PDFProcessor(pdf_path)
    try:
        processor.process_pdf()
        processor.save_all_text(out_path)
    finally:
        processor.close()

参考

版面分析–PDF解析神器pdfplumber
版面分析–富文本txt读取

补充


提取PDF中的图片并保存到本地

import pdfplumber
import os

file_name = "**.pdf"  # Path of the PDF file to parse
output_file = "**.xlsx"  # Base name for the extracted image files

with pdfplumber.open(file_name) as pdf:
    # Get the first page (pages are 0-indexed, so index 0, not 1).
    first_page = pdf.pages[0]
    print('页码:', first_page.page_number)
    print('page width:', first_page.width)
    print('page height:', first_page.height)

    # Get the text of the first page.
    text = first_page.extract_text()
    print(text)

    # Images of the first page: a list of dicts, one per image.
    imgs = first_page.images
    base, ext = os.path.splitext(output_file)
    for index, img in enumerate(imgs):
        # Raw binary stream of the image.
        image_bytes = img['stream'].get_data()
        print(image_bytes)
        # One file per image; writing them all to output_file would leave
        # only the last image on disk.
        with open(f'{base}_{index}{ext}', mode='wb') as f2:
            f2.write(image_bytes)

提取pdf 表格文本,保存为excel文件

import pdfplumber
from openpyxl import Workbook# 保存表格,需要安装openpyxl
file_name = '**.pdf'
output_file = '**.xlsx'
with pdfplumber.open(file_name) as pdf:
    page01 = pdf.pages[0]
    # extract_table() returns None when no table is found on the page;
    # fall back to an empty list so an empty workbook is still written.
    table = page01.extract_table() or []
    workbook = Workbook()
    sheet = workbook.active
    for row in table:
        sheet.append(row)
    workbook.save(filename=output_file)

提取PDF表格 文本

import pdfplumber
file_name = '**.pdf'
output_file = '**.txt'
# Open the output once (append mode, as before) and let `with` close it.
with pdfplumber.open(file_name) as p, open(output_file, 'a', encoding='utf-8') as data:
    for page in p.pages:
        # Table of the page as a list of rows, or None when there is no table.
        # Use page.extract_tables() instead to get every table on the page.
        textdata = page.extract_table() or []
        for row in textdata:
            # write() needs a string: join the cells, mapping empty (None)
            # cells to ''.
            data.write('\t'.join('' if cell is None else cell for cell in row) + '\n')

提取PDF纯文本

import pdfplumber
file_name = '**.pdf'
output_file = '**.txt'
# Open the output once (append mode, as before) and let `with` close it.
with pdfplumber.open(file_name) as p, open(output_file, 'a', encoding='utf-8') as data:
    for page in p.pages:
        # Text of the page; guard against a page that yields no text.
        textdata = page.extract_text() or ''
        # Separate pages with a newline so the last line of one page does not
        # run into the first line of the next.
        data.write(textdata + '\n')

读取富文本txt
python 读取文件函数有三种 read()、readline()、readlines()

  • read() 一次性读取所有文本
  • readline() 每次调用读取一行内容(首次调用返回第一行)
  • readlines() 读取全部内容,以列表(list)的格式返回,每个元素为一行
# read(): load the whole file into a single string.
with open('rag_datas/story.txt', 'r', encoding='utf-8') as f:
    data = f.read()
print(data)

# readline(): fetch one line (here, the first line of the file).
with open('rag_datas/story.txt', 'r', encoding='utf-8') as f:
    data = f.readline()
print(data)

# readlines(): fetch all lines as a list, then print each one without its
# trailing newline.
with open('rag_datas/story.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip('\n')
        print(line)
http://www.yayakq.cn/news/947698/

相关文章:

  • asp.net网站思路阿里巴巴网站威海哪里做?
  • 上海手机网站青岛煜鹏网站建设公司
  • 龙华网站建设昆山哪里有人做网站
  • 网站空间哪里便宜wordpress全站搜索
  • 微信做引流网站服装设计图
  • 小店网站怎么做免费开放的api网站应用
  • 黄冈论坛百度贴吧aso优化贴吧
  • 遂宁网站seo南山网站建设哪家效益快
  • 家居网站建设定位分析论文政务门户网站建设方案
  • 宁波龙山建设有限公司网站如何给网站做后台
  • 网页设计欣赏案例个人seo优化
  • 山西宏图建设集团有限公司网站南阳做网站公司电话
  • 网站开发逻辑图图标在wordpress
  • 做网站威海江西企业网站定制
  • 网站建设与功能模块益阳网站建设企业
  • 做网站的维护成本旅游网站开发设计文档
  • 签订网站建设合同网站改版业务
  • 做彩铃的网站这么建设一个网站
  • 哪里可以做网站首页设计培训班
  • 如何在国外网站上做外贸上海app开发公司
  • 购买游戏软件做网站网站开发要什么
  • 做企业网站备案收费吗建设购物平台网站
  • 制作网站建设策划方案wordpress文章如何去除p节点
  • 宜宾网站制作公司网站左侧漂浮导航
  • 怎么用dede建设网站上海兴业建设有限公司网站
  • wordpress自定义页面插件长沙优化科技有限公司地址
  • 建设视频网站费用建设手机网站哪个平台比较好
  • 河北省建设工程造价管理协会网站最好的ppt模板网站
  • 建设网站公司兴田德润网站开发 asp.net php
  • 手机网站和电脑网站一样吗html5期末大作业个人网站制作