Converter
Tika
Apache Tika is a toolkit for extracting content and metadata from many types of documents, such as Word, Excel, and PDF files, as well as multimedia files like JPEG and MP4.
All text-based and multimedia files can be parsed using a common interface, making Tika a powerful and versatile library for content analysis.
def _split_paragraphs(text, min_length, include_line_breaks):
    """Split extracted document text into the list of paragraphs for one file."""
    paragraphs = []
    short_lines = []  # consecutive chunks shorter than min_length, awaiting a flush

    # Split on a blank line followed by an uppercase letter, a digit, "-" or
    # \u2028 (the Unicode line separator).
    for chunk in re.split("\n\n(?=\u2028|[A-Z-0-9])", text):
        if not chunk or chunk.isspace():
            # Nothing but whitespace: never emit it as a paragraph.
            continue
        if not include_line_breaks:
            # Chunks are kept as they are, one paragraph each.
            # NOTE: newlines are removed, not replaced by a space.
            paragraphs.append(chunk.replace("\n", ""))
        elif len(chunk) >= min_length:
            if short_lines:
                # The buffered lines form their own paragraph, placed before
                # the long paragraph that follows them.
                paragraphs.append(" ".join(short_lines))
                short_lines = []
            paragraphs.append(chunk.replace("\n", ""))
        else:
            # Too short to be a paragraph on its own: treat it as a line.
            short_lines.append(chunk.replace("\n", " ").strip())

    # Lines at the end of the document still have to be emitted.
    if short_lines:
        paragraphs.append(" ".join(short_lines))
    return paragraphs


def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
    """
    Function to convert PDFs to Dataframe with columns as title & paragraphs.

    Parameters
    ----------
    directory_path : str
        Directory whose files are listed (non-recursively); every file whose
        name ends with ".pdf" (case-insensitive) is parsed.
    min_length : integer
        Minimum character length to be considered as a single paragraph.
        Only used when include_line_breaks is True.
    include_line_breaks : bool
        To concatenate paragraphs less than min_length to a single paragraph

    Returns
    -------
    df : Dataframe
        One row per PDF, in sorted file-name order. "title" is the file name
        without its extension, "paragraphs" is a list of strings, or None
        when the file could not be processed.

    Description
    -----------
    If include_line_breaks is set to True, paragraphs with character length
    less than min_length (minimum character length of a paragraph) will be
    considered as a line. Lines before or after each paragraph (length greater
    than or equal to min_length) will be concatenated to a single paragraph to
    form the list of paragraphs in Dataframe.
    Else paragraphs are appended directly to form the list.

    A file that fails to parse is reported on stdout and does not stop the
    conversion of the remaining files.
    """
    # Sorted so the row order does not depend on the order os.listdir returns.
    pdf_names = sorted(
        name for name in os.listdir(directory_path) if name.lower().endswith(".pdf")
    )

    rows = []
    for pdf in pdf_names:
        paragraphs = None  # stays None if the file cannot be processed
        try:
            raw = parser.from_file(os.path.join(directory_path, pdf))
            # "content" may be missing or None (presumably when the parser
            # finds no extractable text); treat that as an empty document.
            content = raw.get("content") or ""
            paragraphs = _split_paragraphs(
                content.strip(), min_length, include_line_breaks
            )
        except Exception as exc:
            # Best effort: report the failure and carry on with the next file.
            print("Unexpected error:", repr(exc))
            print("Unable to process file {}".format(pdf))
        rows.append({"title": os.path.splitext(pdf)[0], "paragraphs": paragraphs})

    # Built in one go; passing columns keeps the schema when no PDF was found.
    return pd.DataFrame(rows, columns=["title", "paragraphs"])
# Convert every PDF found in ./data/pdf/ and save the resulting table as CSV.
pdf_dir = './data/pdf/'
df = pdf_converter(directory_path=pdf_dir, include_line_breaks=False)
df.to_csv('./pdf2txt.csv')
df2squad
def df2squad(df, squad_version="v1.1", output_dir=None, filename=None):
    """
    Converts a pandas dataframe with columns ['title', 'paragraphs'] to a json file with SQuAD format.

    Parameters
    ----------
    df : Dataframe
        Must have the columns "title" and "paragraphs"; each "paragraphs"
        cell is an iterable of paragraph strings. A None or NaN cell is
        treated as a document without paragraphs.
    squad_version : str
        Value stored under the "version" key of the result.
    output_dir : str, optional
        If given, the result is also written to <output_dir>/<filename>.json.
    filename : str, optional
        File name without extension; required when output_dir is given.

    Returns
    -------
    json_data : dict
        {"version": ..., "data": [{"title": ..., "paragraphs":
        [{"context": ..., "qas": []}, ...]}, ...]}

    Raises
    ------
    ValueError
        If output_dir is given without filename.
    """
    # Fail before doing any work instead of silently writing "None.json".
    if output_dir and not filename:
        raise ValueError("filename is required when output_dir is given")

    # The progress bar is a convenience only; convert without it if tqdm is
    # not installed.
    try:
        from tqdm import tqdm
    except ImportError:
        tqdm = None

    rows = df.iterrows()
    if tqdm is not None:
        # iterrows() is a generator, so the bar needs the length explicitly.
        rows = tqdm(rows, total=len(df))

    json_data = {"version": squad_version, "data": []}
    for _, row in rows:
        paragraphs = row["paragraphs"]
        # None or a float (NaN) marks a missing cell; neither is iterable.
        if paragraphs is None or isinstance(paragraphs, float):
            paragraphs = []
        json_data["data"].append(
            {
                "title": row["title"],
                "paragraphs": [
                    {"context": paragraph, "qas": []} for paragraph in paragraphs
                ],
            }
        )

    if output_dir:
        target = os.path.join(output_dir, "{}.json".format(filename))
        with open(target, "w", encoding="utf-8") as outfile:
            json.dump(json_data, outfile)

    return json_data
# Build the SQuAD-style structure in memory, then write it to data_file.json.
json_text = df2squad(df)
with open("data_file.json", "w") as squad_file:
    json.dump(json_text, squad_file)
generate_squad_examples
def generate_squad_examples(question, best_idx_scores, metadata):
    """
    Build one SQuAD-style example per entry of best_idx_scores.

    Parameters
    ----------
    question : str
        Question stored in the single "qas" entry of every example.
    best_idx_scores : dict
        Maps index labels of metadata to a score; the keys select the rows
        and each value is stored as "retriever_score".
    metadata : Dataframe
        Must have the columns "title" and "content".

    Returns
    -------
    squad_examples : list of dict
        One {"title", "paragraphs"} dict per selected row, in the key order
        of best_idx_scores. Each has a single paragraph whose "context" is
        the row's content and whose only question gets a fresh uuid4 "id"
        and an empty "answers" list.
    """
    selected = metadata.loc[best_idx_scores.keys()]
    return [
        {
            "title": record["title"],
            "paragraphs": [
                {
                    "context": record["content"],
                    "qas": [
                        {
                            "answers": [],
                            "question": question,
                            "id": str(uuid.uuid4()),
                            "retriever_score": best_idx_scores[label],
                        }
                    ],
                }
            ],
        }
        for label, record in selected.iterrows()
    ]