2024-01-29 21:19:10 +01:00
import PyPDF2
import pandas as pd
import regex , re
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one flat list of lines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of strings: the extracted text of each page split on
        newline characters, concatenated in page order.
    """
    lines = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly rather than indexing by page number.
        for page in pdf_reader.pages:
            lines.extend(page.extract_text().split('\n'))
    return lines
# Path of the PDF to convert; change it to point at your own copy of the file.
2024-09-29 22:37:18 +02:00
# Relative path, so it is resolved against the directory the script is run from.
pdf_path = ' telos_nt.pdf '
2024-01-29 21:19:10 +01:00
# Read the whole PDF into a list of text lines (one element per extracted line).
# The trailing `.replace` is a disabled leftover: the function returns a list,
# not a single string.
extracted_text = extract_text_from_pdf ( pdf_path ) #.replace('\n', '')
def merge_entries(verse_list):
    """Join continuation lines onto the numbered line that precedes them.

    A line whose first character is a digit opens a new entry. Every other
    line is appended, after a single space, to the entry currently being
    built.

    Args:
        verse_list: Iterable of text lines in reading order.

    Returns:
        List of merged entries, in the order in which they were opened.
    """
    opens_new_entry = re.compile(r'^\d').match
    merged = []
    pending = ""
    for line in verse_list:
        if opens_new_entry(line) is None:
            # Continuation line: glue it onto whatever is pending.
            pending = pending + ' ' + line
            continue
        # A numbered line closes the pending entry (if any) and starts a new one.
        if pending:
            merged.append(pending)
        pending = line
    # Flush the entry that was still open when the input ran out.
    if pending:
        merged.append(pending)
    return merged
# Merge the raw PDF lines into one string per numbered entry and load them
# into a single-column DataFrame.
merged_entries = merge_entries(extracted_text)
df = pd.DataFrame(merged_entries, columns=['Scripture'])
# Remove an annoying typo: a stray ")" glued to the front of two book names
# (and the missing diaeresis in "Efeziërs").
# regex=False is required for portability: a pattern starting with ")" is not
# a valid regular expression, and pandas releases before 2.0 interpreted the
# pattern as a regex by default.
df['Scripture'] = df['Scripture'].str.replace(')Efeziers', 'Efeziërs', regex=False)
df['Scripture'] = df['Scripture'].str.replace(')Hebreeën', 'Hebreeën', regex=False)
# Replace every occurrence of the first literal with the second one.
# NOTE(review): the exact whitespace inside these two literals matters — confirm
# the intended pattern and replacement before touching this line.
df [ ' Scripture ' ] = df [ ' Scripture ' ] . str . replace ( ' - ' , ' ' )
# Split each entry once: the leading token becomes 'Verse', the remainder stays
# in 'Scripture' (n = 1 limits the split to the first separator).
df [ [ ' Verse ' , ' Scripture ' ] ] = df [ ' Scripture ' ] . str . split ( ' ' , expand = True , n = 1 )
# Break the remaining text into tokens, one DataFrame column per token.
split_columns = df [ ' Scripture ' ] . str . split ( ' ' , expand = True )
# 'Chapter' is the last token of the row that consists only of digits, or None
# when the row has no such token.
# NOTE(review): any all-digit token qualifies, so a number inside the running
# text is picked up as well — confirm this is acceptable for the input.
df [ ' Chapter ' ] = split_columns . apply ( lambda x : next ( ( elem for elem in reversed ( x ) if isinstance ( elem , str ) and elem . isdigit ( ) ) , None ) , axis = 1 )
# Derive 'Bookname' for every row. The token inspected throughout is the
# second-to-last space-separated token of the stripped 'Scripture' text.
# NOTE(review): the indentation of this block is missing, so the nesting
# described in the comments below is inferred from statement order — confirm.
for index , row in df . iterrows ( ) :
# Only rows for which a chapter token was found get a book name here.
if row [ ' Chapter ' ] is not None :
# Separator chain: if the token contains the tested separator, keep only the
# text after its last occurrence. The first matching branch wins.
if ' , ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' , ' ) [ - 1 ]
elif ' . ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' . ' ) [ - 1 ]
elif ' ; ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' ; ' ) [ - 1 ]
elif ' : ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' : ' ) [ - 1 ]
elif ' ? ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' ? ' ) [ - 1 ]
elif ' ! ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' ! ' ) [ - 1 ]
elif ' ) ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' ) ' ) [ - 1 ]
# NOTE(review): this branch tests for one literal but splits on a different
# one (containing "-)") — confirm which separator is intended.
elif ' - ' in row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] :
df . at [ index , ' Bookname ' ] = row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' -) ' ) [ - 1 ]
else :
# No separator found: fall back to a regex tokenisation. The pattern appears
# intended to yield capitalised words, lowercase runs and digit runs — confirm.
try :
# Third-from-last match over the whole stripped text; an IndexError here
# (fewer than three matches) is handled by the outer handler below.
preceding_value = regex . findall ( r ' \ P {L} \ p {Lu} \ p {Ll} *| \ p {Lu} \ p {Ll} *| \ p {Ll} +|[0-9]+ ' , row [ ' Scripture ' ] . strip ( ) ) [ - 3 ]
try :
# If that match parses as a non-zero integer, prefix it to the
# second-to-last match (presumably for numbered book names — confirm).
# NOTE(review): when int() yields 0 nothing is assigned in this branch.
if int ( preceding_value ) :
df . at [ index , ' Bookname ' ] = preceding_value + regex . findall ( r ' \ P {L} \ p {Lu} \ p {Ll} *| \ p {Lu} \ p {Ll} *| \ p {Ll} +|[0-9]+ ' , row [ ' Scripture ' ] . strip ( ) ) [ - 2 ]
# NOTE(review): bare except — it handles the failed int() conversion but also
# hides every other error raised above; consider narrowing it.
except :
# Not an integer: take the third match found inside the inspected token.
df . at [ index , ' Bookname ' ] = regex . findall ( r ' \ P {L} \ p {Lu} \ p {Ll} *| \ p {Lu} \ p {Ll} *| \ p {Ll} +|[0-9]+ ' , row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' -) ' ) [ - 1 ] ) [ 2 ]
# Too few matches for the indexes used above: try the second match of the
# inspected token, and if that is missing too, the first one.
except IndexError :
try :
df . at [ index , ' Bookname ' ] = regex . findall ( r ' \ P {L} \ p {Lu} \ p {Ll} *| \ p {Lu} \ p {Ll} *| \ p {Ll} +|[0-9]+ ' , row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' -) ' ) [ - 1 ] ) [ 1 ]
except IndexError :
df . at [ index , ' Bookname ' ] = regex . findall ( r ' \ P {L} \ p {Lu} \ p {Ll} *| \ p {Lu} \ p {Ll} *| \ p {Ll} +|[0-9]+ ' , row [ ' Scripture ' ] . strip ( ) . split ( ' ' ) [ - 2 ] . split ( ' -) ' ) [ - 1 ] ) [ 0 ]
# Presumably the counterpart of the `Chapter is not None` test above — confirm.
else :
df . at [ index , ' Bookname ' ] = None # no chapter token in this row; the gap is filled from neighbouring rows later
# Forward and backward fill Bookname and Chapter: a missing value is taken from
# the previous row, and any gap left at the very start from the next row.
# Series.ffill()/bfill() are used because fillna(method=...) is deprecated in
# current pandas releases.
df['Bookname'] = df['Bookname'].ffill().bfill()
df['Chapter'] = df['Chapter'].ffill().bfill()
# Create a new column, 'Book', holding a number per distinct 'Bookname':
# names are numbered in order of first appearance, starting at 40 (intended to
# be Matthew, running through to Revelation).
# The list of distinct names is computed once instead of on every iteration.
book_names = list(df['Bookname'].unique())
for position, book in enumerate(book_names):
    df.loc[df['Bookname'] == book, 'Book'] = position + 40
2024-09-29 22:37:18 +02:00
# Export the whole table (every column, without the index) to a CSV file.
# NOTE(review): this runs before the integer casts of Verse/Chapter/Book further
# down, so the CSV holds the pre-cast values — confirm that is intended.
df . to_csv ( ' telos.csv ' , index = False )
2024-01-29 21:19:10 +01:00
2024-09-29 22:37:18 +02:00
# Single-row metadata table that is stored alongside the verses.
df_details = pd.DataFrame(columns=[
    'Description', 'Abbreviation', 'Comments', 'Version', 'VersionDate',
    'PublishDate', 'RightToLeft', 'OT', 'NT', 'Strong',
])
df_details.loc[0] = [
    'Telos vertaling',
    'TELOS',
    'Telos from PDF. H. Medema Vaassen, revised from the original Voorhoeve translation from 1877.',
    '1',
    '2024-09-29',
    '1982-01-01',
    0,
    0,
    1,
    0,
]

# Give every exported column an explicit type before it is written out.
df['Verse'] = df['Verse'].astype(int)
df['Chapter'] = df['Chapter'].astype(int)
df['Book'] = df['Book'].astype(int)
df['Bookname'] = df['Bookname'].astype(str)
df['Scripture'] = df['Scripture'].astype(str)

# Write both dataframes to a single db file
import sqlite3

conn = sqlite3.connect('telos.mybible')
try:
    # Existing tables of the same name are replaced on every run.
    df[['Book', 'Chapter', 'Verse', 'Scripture']].to_sql('Scripture', conn, if_exists='replace', index=False)
    df_details.to_sql('Details', conn, if_exists='replace', index=False)
finally:
    # Release the database file even when one of the writes fails.
    conn.close()
2024-01-29 21:19:10 +01:00
# with open('/mnt/c/Projects/telosvertaling/merged_entries.txt', 'w', encoding='utf-8') as f:
# f.write(str([text for text in merged_entries]))