| import gradio as gr |
| import warnings |
| from typing import List |
| import json |
| from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf |
|
|
| from base_utils import ( |
| convert_pdf_to_image, |
| extract_text_from_pdf, |
| convert_doc_to_text, |
| extract_text_from_docx, |
| extract_text_from_ppt, |
| extract_text_from_pptx, |
| sanitize_list_of_lists, |
| parse_url, |
| ) |
|
|
# PDF upload -> image gallery, served by convert_pdf_to_image.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

# PDF upload -> text box, served by extract_text_from_pdf.
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)
|
|
# File upload -> text box, served by convert_doc_to_text.
doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)

# File upload -> text box, served by extract_text_from_docx.
docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)
|
|
# File upload -> text box, served by extract_text_from_ppt.
ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)
|
|
# File upload -> text box, served by extract_text_from_pptx.
pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)
# Free text -> JSON viewer, served by sanitize_list_of_lists. The single
# example below is a list of [question, answer] pairs.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)
|
|
# Text in -> text out, served by parse_url; exposed on the API as "url_to_text".
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)
|
|
|
|
class FileNotConvertedWarning(Warning):
    """Warning category for files whose format is not supported for PDF conversion."""
|
|
|
|
def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file in ``files`` to PDF.

    ``.pdf`` inputs are passed through untouched. ``.docx``, ``.html``,
    ``.pptx``, ``.csv`` and ``.xml`` inputs are handed to ``convert_to_pdf``
    and ``.md`` inputs to ``convert_markdown_to_pdf``. Any other input
    triggers a ``FileNotConvertedWarning`` and is left out of the result.
    Extension matching is case-sensitive.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per supported input, in input order: the original path for
        PDFs, otherwise whatever the converter returned.
    """
    generic_exts = (".docx", ".html", ".pptx", ".csv", ".xml")
    markdown_ext = ".md"

    pdfs = []
    for f in files:
        if f.endswith(".pdf"):
            # Already a PDF: nothing to convert.
            pdfs.append(f)
            continue

        ext = next(
            (e for e in generic_exts + (markdown_ext,) if f.endswith(e)),
            None,
        )
        if ext is None:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Swap only the trailing extension. str.replace() would also rewrite
        # the same substring earlier in the path (e.g. "notes.md.d/x.md").
        newfile = f[: -len(ext)] + ".pdf"
        converter = convert_markdown_to_pdf if ext == markdown_ext else convert_to_pdf
        # Third argument is the output path up to its first dot, as before.
        # NOTE(review): presumably the PDF title -- confirm against pdfitdown.
        pdfs.append(converter(f, newfile, newfile.split(".")[0]))
    return pdfs
|
|
|
|
def convert(file: str) -> List[str]:
    """Convert a single file to PDF by delegating to ``to_pdf``.

    Args:
        file: Path of the file to convert.

    Returns:
        The list produced by ``to_pdf`` for this one file. It is empty when
        the format is unsupported, because ``to_pdf`` only warns and skips
        such files.
    """
    return to_pdf([file])
|
|
|
|
def parse_MCQs(mcq_string: str) -> list:
    """Extract and decode the JSON array embedded in ``mcq_string``.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    so surrounding text such as a Markdown code fence is tolerated. When no
    ``]`` follows the first ``[``, a closing bracket is appended before
    decoding.

    Args:
        mcq_string: Text containing a JSON array.

    Returns:
        The decoded JSON array.

    Raises:
        ValueError: If the text contains no ``[``, or if the extracted span
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``
            subclass).
    """
    start = mcq_string.find("[")
    if start == -1:
        raise ValueError("No JSON array found in the input: missing '['")

    # Only look for the closing bracket after the opening one.
    end = mcq_string.rfind("]", start)
    if end == -1:
        json_data = mcq_string[start:] + "]"
    else:
        json_data = mcq_string[start : end + 1]
    return json.loads(json_data)
|
|
|
|
# Text box -> JSON viewer, served by parse_MCQs. The example is a fenced
# ```json block holding a list of question/options/answer objects.
mcqs_to_json = gr.Interface(
    fn=parse_MCQs,
    inputs=gr.Textbox(),
    outputs=gr.JSON(),
    api_name="mcqs_to_json",
    cache_examples=False,
    examples=[
        [
            """```json
[
{
"question": "Which of the following best describes the nature of business?",
"options": {
"A": "It is primarily a non-economic activity",
"B": "It involves personal consumption of goods",
"C": "It includes regular and continuous transactions for profit",
"D": "It excludes exchange of goods and services"
},
"answer": "C"
},
{
"question": "According to the document, what is a primary objective of business under economic objectives?",
"options": {
"A": "Employee welfare",
"B": "Profit earning",
"C": "Creating entertainment content",
"D": "Reducing government involvement"
},
"answer": "B"
},
{
"question": "Which of the following is a component of commerce?",
"options": {
"A": "Mining",
"B": "Manufacturing",
"C": "Warehousing",
"D": "Farming"
},
"answer": "C"
},
{
"question": "What is an example of a synthetic manufacturing industry?",
"options": {
"A": "Oil refining",
"B": "Textile processing",
"C": "Soap production",
"D": "Watch assembly"
},
"answer": "C"
},
{
"question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
"options": {
"A": "Banking",
"B": "Insurance",
"C": "Advertising",
"D": "Warehousing"
},
"answer": "C"
}
]
```
"""
        ]
    ],
)
|
|
# File upload -> downloadable file, served by convert; exposed on the API as
# "convert_to_pdf".
pdf_converter = gr.Interface(
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
)
|
|
# One tab per interface. The two lists are positional counterparts, so
# tab_names[i] labels interface_list[i].
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        mcqs_to_json,
        pdf_converter,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Parse MCQs",
        "Convert to PDF",
    ],
)
|
|
# Serve on port 7860. "0.0.0.0" makes the app reachable from other hosts; the
# previous value "0.0.0.0." had a stray trailing dot and was not a valid
# IPv4 address.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
|