|
1 年之前 | |
---|---|---|
docs | 1 年之前 | |
file_processing | 1 年之前 | |
need_package | 1 年之前 | |
proto | 1 年之前 | |
test | 1 年之前 | |
util | 1 年之前 | |
app.py | 1 年之前 | |
extract.sh | 1 年之前 | |
extractFileServer.py | 1 年之前 | |
nsq_server.py | 1 年之前 | |
readme.md | 1 年之前 |
### 一、文件包改造

1. pdfplumber 中 table.py(extract 函数)
# Patched row loop of Table.extract(): before the text of each cell is
# assembled, characters that overlap the previously kept character of the row
# are filtered out (presumably glyphs drawn twice in the PDF -- confirm
# against the documents this patch was written for).
for row in self.rows:
    arr = []
    row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]

    # `prev_box` holds (x0, x1, y0) of the last character that was kept.
    # It starts as None so the first character of a row is never compared
    # against a placeholder box: with a (0, 0, 0, 0) placeholder, a glyph
    # whose x0 <= 0 was silently discarded.
    clear_chars = []
    prev_box = None
    for clear_char in row_chars:
        char_x0, char_x1 = clear_char.get("x0"), clear_char.get("x1")
        char_y0, char_y1 = clear_char.get("y0"), clear_char.get("y1")
        if prev_box is not None:
            l_x0, l_x1, l_y0 = prev_box
            # Drop a character that starts at or left of the horizontal
            # midpoint of the previously kept one while its vertical
            # midpoint is greater than that character's y0.
            if char_x0 <= (l_x1 + l_x0) / 2 and (char_y0 + char_y1) / 2 > l_y0:
                continue
        prev_box = (char_x0, char_x1, char_y0)
        clear_chars.append(clear_char)
    row_chars = clear_chars

    for cell in row.cells:
        if cell is None:
            # A missing cell is reported as None, not as an empty string.
            cell_text = None
        else:
            cell_chars = [
                char for char in row_chars if char_in_bbox(char, cell)
            ]
            if cell_chars:
                cell_text = utils.extract_text(
                    cell_chars,
                    x_tolerance=x_tolerance,
                    y_tolerance=y_tolerance,
                ).strip()
            else:
                cell_text = ""
        arr.append(cell_text)
    table_arr.append(arr)
2. pdfplumber 中 utils.py(iter_chars_to_words 函数)
def iter_chars_to_words(self, chars):
    """Group ``chars`` into words, yielding each word as a list of chars.

    Compared with the stock implementation, a character is skipped when it
    starts at or left of the horizontal midpoint of the last accepted
    character while its vertical midpoint is greater than that character's
    ``y0`` (presumably a glyph drawn twice at nearly the same position --
    confirm against the PDFs this patch targets).

    Args:
        chars: iterable of char dicts; each must provide ``"text"`` and is
            read for ``"x0"``, ``"x1"``, ``"y0"`` and ``"y1"``.

    Yields:
        Lists of char dicts, one list per word.
    """
    current_word = []
    current_bbox = None
    # (x0, x1, y0) of the last accepted character.  None until one has been
    # seen, so the very first character is never compared against a
    # placeholder box: a (0, 0, 0, 0) placeholder dropped glyphs with x0 <= 0.
    last_box = None

    for char in chars:
        if not self.keep_blank_chars and char["text"].isspace():
            # Whitespace ends the current word and is itself discarded.
            if current_word:
                yield current_word
                current_word = []
                current_bbox = None

        elif current_word and self.char_begins_new_word(
            current_word, current_bbox, char
        ):
            yield current_word
            # The character that opens a new word is always accepted.
            last_box = (char.get("x0"), char.get("x1"), char.get("y0"))
            current_word = [char]
            current_bbox = obj_to_bbox(char)

        else:
            char_x0, char_x1 = char.get("x0"), char.get("x1")
            char_y0, char_y1 = char.get("y0"), char.get("y1")
            if last_box is not None:
                l_x0, l_x1, l_y0 = last_box
                # Skip a character overlapping the last accepted one.
                if char_x0 <= (l_x1 + l_x0) / 2 and (char_y0 + char_y1) / 2 > l_y0:
                    continue
            last_box = (char_x0, char_x1, char_y0)
            current_word.append(char)
            if current_bbox is None:
                current_bbox = obj_to_bbox(char)
            else:
                current_bbox = merge_bboxes([current_bbox, obj_to_bbox(char)])

    if current_word:
        yield current_word
## 二、环境安装

### 1、软件需求
libreoffice软件
unrar与unzip
swftools