Spaces:

RenAzum
/

documentAnalyzer

Sleeping

App Files Files Community

RenAzum committed on Nov 13, 2024

Commit

d84f60a

1 Parent(s): 90d777d

Document Analysis

Browse files

Files changed (2) hide show

main.py +107 -0
requirements.txt +45 -0

main.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import streamlit as st
+import fitz  # PyMuPDF
+import docx
+from difflib import HtmlDiff, SequenceMatcher
+import os
+import re
+# Functions to extract text and metadata
+def extract_text_pdf(file):
+    doc = fitz.open(file)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text
+def extract_text_word(file):
+    doc = docx.Document(file)
+    text = "\n".join([para.text for para in doc.paragraphs])
+    return text
+def extract_metadata_pdf(file):
+    doc = fitz.open(file)
+    metadata = doc.metadata
+    return metadata
+def extract_metadata_word(file):
+    doc = docx.Document(file)
+    core_props = doc.core_properties
+    metadata = {
+        "author": core_props.author,
+        "created": core_props.created,
+        "modified": core_props.modified
+    }
+    return metadata
+# Function to compare text using difflib and return highlighted HTML differences
+def compare_texts(text1, text2):
+    differ = HtmlDiff()
+    return differ.make_file(text1.splitlines(), text2.splitlines(), context=True, numlines=2)
+# Function to calculate similarity score
+def calculate_similarity(text1, text2):
+    matcher = SequenceMatcher(None, text1, text2)
+    return matcher.ratio()
+# Streamlit App Interface
+st.title("Document Edit Detection POC")
+st.write("Upload both the original and edited documents below:")
+# File upload
+original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
+edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])
+# Process if both files are uploaded
+if original_file and edited_file:
+    # Identify file types
+    original_ext = os.path.splitext(original_file.name)[1]
+    edited_ext = os.path.splitext(edited_file.name)[1]
+    # Check if both files are of the same type
+    if original_ext != edited_ext:
+        st.error("Both documents must be of the same type (PDF or DOCX).")
+    else:
+        # Extract text and metadata
+        if original_ext == ".pdf":
+            original_text = extract_text_pdf(original_file)
+            edited_text = extract_text_pdf(edited_file)
+            original_metadata = extract_metadata_pdf(original_file)
+            edited_metadata = extract_metadata_pdf(edited_file)
+        else:
+            original_text = extract_text_word(original_file)
+            edited_text = extract_text_word(edited_file)
+            original_metadata = extract_metadata_word(original_file)
+            edited_metadata = extract_metadata_word(edited_file)
+        # Display Metadata
+        st.subheader("Metadata Comparison")
+        metadata_match = original_metadata == edited_metadata
+        st.write("Metadata Match:", metadata_match)
+        st.write("Original Document Metadata:")
+        st.write(original_metadata)
+        st.write("Edited Document Metadata:")
+        st.write(edited_metadata)
+        # Compare text
+        st.subheader("Text Comparison")
+        text_diff_html = compare_texts(original_text, edited_text)
+        similarity_score = calculate_similarity(original_text, edited_text)
+        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
+        text_match = similarity_score == 1.0
+        st.write("Text Match:", text_match)
+        # Display highlighted text differences
+        st.write("Differences:")
+        st.components.v1.html(text_diff_html, height=400, scrolling=True)
+        # Report Generation
+        st.subheader("Report Summary")
+        st.write("Metadata Match:", metadata_match)
+        st.write("Text Match:", text_match)
+        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
+else:
+    st.info("Please upload both the original and edited documents to proceed.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,45 @@

+altair==5.4.1
+attrs==24.2.0
+blinker==1.9.0
+cachetools==5.5.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+colorama==0.4.6
+gitdb==4.0.11
+GitPython==3.1.43
+idna==3.10
+Jinja2==3.1.4
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+lxml==5.3.0
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+narwhals==1.13.4
+numpy==2.1.3
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+protobuf==5.28.3
+pyarrow==18.0.0
+pydeck==0.9.1
+Pygments==2.18.0
+PyMuPDF==1.24.13
+python-dateutil==2.9.0.post0
+python-docx==1.1.2
+pytz==2024.2
+referencing==0.35.1
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.21.0
+six==1.16.0
+smmap==5.0.1
+streamlit==1.40.1
+tenacity==9.0.0
+toml==0.10.2
+tornado==6.4.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+watchdog==6.0.0