信息发布模块

lijunliang b68c46ebc1 first commit		1 年之前
docs	b68c46ebc1 first commit	1 年之前
file_processing	b68c46ebc1 first commit	1 年之前
need_package	b68c46ebc1 first commit	1 年之前
proto	b68c46ebc1 first commit	1 年之前
test	b68c46ebc1 first commit	1 年之前
util	b68c46ebc1 first commit	1 年之前
app.py	b68c46ebc1 first commit	1 年之前
extract.sh	b68c46ebc1 first commit	1 年之前
extractFileServer.py	b68c46ebc1 first commit	1 年之前
nsq_server.py	b68c46ebc1 first commit	1 年之前
readme.md	b68c46ebc1 first commit	1 年之前

pdf文件抽取

## 一、文件包改造

1. pdfplumber 中的 table.py（extract 函数）

    # Excerpt from the body of extract(): for every table row, gather the
    # characters inside the row's bbox, filter out characters that overlap the
    # previously kept one, then build one text value per cell.
    for row in self.rows:
        arr = []
        row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
        clear_chars=[]
        # Coordinates of the last character that was kept; all zeros until the
        # first character of the row is accepted. (l_y1 is assigned but never
        # read in this excerpt.)
        l_x0, l_x1, l_y0, l_y1 = 0, 0, 0, 0
        for clear_char in row_chars:
            char_x0, char_x1, char_y0, char_y1 = clear_char.get("x0"), clear_char.get("x1"), clear_char.get("y0"), clear_char.get("y1")
            # Skip a character whose left edge is at or left of the horizontal
            # midpoint of the last kept character AND whose vertical midpoint is
            # greater than that character's y0.
            # NOTE(review): presumably this removes duplicated / overprinted
            # glyphs that would otherwise repeat in the cell text — confirm.
            if char_x0 <= (l_x1+l_x0)/2  and (char_y0 + char_y1) / 2 > l_y0:
                continue
            l_x0, l_x1, l_y0, l_y1 = char_x0, char_x1, char_y0, char_y1
            clear_chars.append(clear_char)
        # From here on only the filtered characters are used for this row.
        row_chars=clear_chars
        for cell in row.cells:
            if cell is None:
                # A missing cell is reported as None rather than as text.
                cell_text = None
            else:
                cell_chars = [
                    char for char in row_chars if char_in_bbox(char, cell)
                ]

                if len(cell_chars):
                    cell_text = utils.extract_text(
                        cell_chars,
                        x_tolerance=x_tolerance,
                        y_tolerance=y_tolerance,
                    ).strip()
                else:
                    # A cell that exists but contains no characters yields "".
                    cell_text = ""
            arr.append(cell_text)
        # One list of cell values per row is appended to the result.
        table_arr.append(arr)

2. pdfplumber 中的 utils.py（iter_chars_to_words 函数）

def iter_chars_to_words(self, chars):
    """Yield lists of character objects, one list per word.

    Characters are consumed in the order given. A whitespace character ends
    the current word unless ``self.keep_blank_chars`` is set, and
    ``self.char_begins_new_word`` decides whether a character starts a new
    word. A character that would extend the current word is discarded when
    its left edge is at or left of the horizontal midpoint of the last kept
    character and its vertical midpoint is greater than that character's
    ``y0``.

    NOTE(review): the discard rule is a local patch; presumably it removes
    duplicated / overprinted glyphs — confirm against the affected PDFs.

    Args:
        chars: iterable of character mappings providing ``"text"``,
            ``"x0"``, ``"x1"``, ``"y0"`` and ``"y1"``.

    Yields:
        The list of characters making up each word.
    """
    word = []
    word_bbox = None
    # Local patch: geometry of the last kept character (zeros until one is seen).
    prev_x0 = prev_x1 = prev_y0 = 0

    for char in chars:
        if not self.keep_blank_chars and char["text"].isspace():
            # Whitespace closes the word in progress, if there is one.
            if word:
                yield word
                word, word_bbox = [], None
            continue

        if word and self.char_begins_new_word(word, word_bbox, char):
            yield word
            # Local patch: the first character of a new word is always kept
            # and becomes the reference for the overlap test.
            prev_x0, prev_x1, prev_y0 = (
                char.get("x0"),
                char.get("x1"),
                char.get("y0"),
            )
            word = [char]
            word_bbox = obj_to_bbox(char)
            continue

        # Local patch: drop a character that overlaps the last kept one.
        x0, x1 = char.get("x0"), char.get("x1")
        y0, y1 = char.get("y0"), char.get("y1")
        if x0 <= (prev_x1 + prev_x0) / 2 and (y0 + y1) / 2 > prev_y0:
            continue
        prev_x0, prev_x1, prev_y0 = x0, x1, y0

        word.append(char)
        char_bbox = obj_to_bbox(char)
        if word_bbox is None:
            word_bbox = char_bbox
        else:
            word_bbox = merge_bboxes([word_bbox, char_bbox])

    if word:
        yield word

## 二、环境安装

### 1、软件需求

- LibreOffice 软件
- unrar 与 unzip
- swftools

readme.md

pdf文件抽取