"""
Utilities for converting a directory of PDFs into a SQuAD-format corpus.

Apache Tika is a toolkit for extracting content and metadata from various
types of documents, such as Word, Excel, and PDF, or even multimedia files
like JPEG and MP4.

All text-based and multimedia files can be parsed through a common interface,
making Tika a powerful and versatile library for content analysis.
"""

def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
    """
    Convert every PDF in a directory to a DataFrame of title & paragraphs.

    Parameters
    ----------
    directory_path : str
        Directory that contains the ``.pdf`` files to parse.
    min_length : int
        Minimum character length for a chunk to count as a single paragraph.
    include_line_breaks : bool
        If True, chunks shorter than ``min_length`` are treated as lines:
        lines before or after each full paragraph (length >= ``min_length``)
        are concatenated into a single paragraph of their own.  If False,
        every chunk is appended to the paragraph list as-is.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``title`` (file name without the ``.pdf`` extension) and
        ``paragraphs`` (list of paragraph strings, or None on parse failure).
    """
    # ".pdf" (with the dot) so a file merely *ending* in "pdf" is not picked up.
    list_pdf = [f for f in os.listdir(directory_path) if f.endswith(".pdf")]

    df = pd.DataFrame(columns=["title", "paragraphs"])
    for i, pdf in enumerate(list_pdf):
        try:
            df.loc[i] = [pdf.replace(".pdf", ""), None]
            raw = parser.from_file(os.path.join(directory_path, pdf))
            s = raw["content"].strip()
            # Split on blank lines followed by a line separator (\u2028)
            # or an uppercase letter / dash / digit.
            paragraphs = re.split("\n\n(?=\u2028|[A-Z-0-9])", s)
            list_par = []
            temp_para = ""  # accumulates chunks shorter than min_length
            for p in paragraphs:
                if p.isspace():  # skip whitespace-only chunks
                    continue
                if include_line_breaks:
                    if len(p) >= min_length:
                        if temp_para:
                            # Flush the lines accumulated before paragraph p.
                            list_par.append(temp_para.strip())
                            temp_para = ""  # reset for the next run of lines
                        # Append the full-length paragraph itself.
                        list_par.append(p.replace("\n", ""))
                    else:
                        # Short chunk: treat it as a line and keep accumulating.
                        line = p.replace("\n", " ").strip()
                        temp_para = temp_para + f" {line}"
                else:
                    # No line handling requested: append the chunk as-is.
                    list_par.append(p.replace("\n", ""))
            if temp_para:
                # Flush any trailing accumulated lines.
                list_par.append(temp_para.strip())
            df.loc[i, "paragraphs"] = list_par
        except Exception:
            # Best-effort: report the failure and continue with the next PDF.
            print("Unexpected error:", sys.exc_info()[0])
            print("Unable to process file {}".format(pdf))
    return df
# Build the corpus DataFrame from the PDFs under ./data/pdf/.
df = pdf_converter(include_line_breaks=False, directory_path='./data/pdf/')
# print(df.head(5))


def df2squad(df, squad_version="v1.1", output_dir=None, filename=None):
    """
    Convert a pandas DataFrame with columns ['title', 'paragraphs'] to a
    SQuAD-format dict, optionally writing it to a JSON file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have columns 'title' (str) and 'paragraphs' (list of str).
    squad_version : str
        Value stored under the top-level "version" key.
    output_dir : str, optional
        If given, the result is dumped to ``<output_dir>/<filename>.json``.
    filename : str, optional
        Output file name without extension; only used when output_dir is set.

    Returns
    -------
    json_data : dict
        ``{"version": ..., "data": [{"title": ..., "paragraphs":
        [{"context": ..., "qas": []}, ...]}, ...]}``
    """
    try:
        from tqdm import tqdm  # optional progress bar
    except ImportError:
        # Fall back to a plain pass-through when tqdm is not installed.
        def tqdm(iterable):
            return iterable

    json_data = {}
    json_data["version"] = squad_version
    json_data["data"] = []

    for idx, row in tqdm(df.iterrows()):
        temp = {"title": row["title"], "paragraphs": []}
        for paragraph in row["paragraphs"]:
            temp["paragraphs"].append({"context": paragraph, "qas": []})
        # BUG FIX: without this append, "data" was always left empty.
        json_data["data"].append(temp)

    if output_dir:
        with open(os.path.join(output_dir, "{}.json".format(filename)), "w") as outfile:
            json.dump(json_data, outfile)

    return json_data
# Serialize the SQuAD-formatted corpus to disk for later use.
json_text = df2squad(df)
# print(json_text)
with open("data_file.json", "w") as out_fh:
    json.dump(json_text, out_fh)


def generate_squad_examples(question, best_idx_scores, metadata):
    """
    Build SQuAD-style examples pairing one question with retrieved paragraphs.

    Parameters
    ----------
    question : str
        The question to attach to every retrieved paragraph.
    best_idx_scores : Mapping[int, float]
        Maps metadata row label -> retriever score, ordered by relevance
        (e.g. an OrderedDict produced by a retriever).
    metadata : pandas.DataFrame
        Must contain columns 'title' and 'content' and be indexed by the
        same labels used in ``best_idx_scores``.

    Returns
    -------
    squad_examples : list of dict
        One SQuAD article per retrieved row; each QA entry gets a fresh
        uuid4 id, empty answers, and the row's retriever score.
    """
    squad_examples = []

    # list(...) keeps .loc happy across pandas versions that reject dict_keys.
    metadata_sliced = metadata.loc[list(best_idx_scores.keys())]

    for idx, row in metadata_sliced.iterrows():
        temp = {"title": row["title"], "paragraphs": []}

        temp["paragraphs"] = [
            {
                "context": row["content"],
                "qas": [
                    {
                        "answers": [],
                        "question": question,
                        "id": str(uuid.uuid4()),  # unique id per QA pair
                        "retriever_score": best_idx_scores[idx],
                    }
                ],
            }
        ]
        squad_examples.append(temp)

    return squad_examples