"""Convert the Telos New Testament PDF into a CSV file and a SQLite database.

Pipeline: read every line of ``telos_nt.pdf``, merge the lines into one entry
per verse (a verse starts with a digit), derive verse number, chapter and book
name from each entry, number the books starting at 40, then write
``telos.csv`` and the SQLite file ``telos.mybible``.
"""

from contextlib import closing
import re
import sqlite3

import pandas as pd
import PyPDF2
import regex

# A line that opens a new verse starts with a digit.
_LEADING_DIGIT = re.compile(r'^\d')

# Tokeniser used by the book-name heuristic: a capitalised word (optionally
# glued to one preceding non-letter character), a bare capitalised word, a
# lowercase run, or a run of ASCII digits.
_TOKEN = regex.compile(r'\P{L}\p{Lu}\p{Ll}*|\p{Lu}\p{Ll}*|\p{Ll}+|[0-9]+')

# Punctuation that may glue the end of the verse text to the book name; they
# are tried in this order and the text after the last occurrence is kept.
_HEADING_SEPARATORS = (',', '.', ';', ':', '?', '!', ')')


def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF as one flat list of lines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of strings: each page's extracted text split on newlines, in
        page order.
    """
    lines = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against a page for which no text is returned, so one
            # empty page does not abort the whole extraction.
            lines.extend((page.extract_text() or '').split('\n'))
    return lines


def merge_entries(verse_list):
    """Merge raw text lines into one entry per verse.

    A line starting with a digit opens a new entry; every other line is
    appended, separated by a single space, to the entry currently being built.

    Args:
        verse_list: Iterable of text lines.

    Returns:
        A list of merged entries in input order. Lines that precede the first
        digit-led line form a leading entry of their own (prefixed by a space).
    """
    merged_list = []
    current_entry = ""
    for verse in verse_list:
        if _LEADING_DIGIT.match(verse):
            # A new verse begins: flush the one collected so far.
            if current_entry:
                merged_list.append(current_entry)
            current_entry = verse
        else:
            # Continuation line of the current verse.
            current_entry += ' ' + verse
    if current_entry:
        merged_list.append(current_entry)
    return merged_list


def _last_numeric_token(text):
    """Return the last space-separated all-digit token of *text*, or None."""
    if not isinstance(text, str):
        return None
    return next(
        (word for word in reversed(text.split(' ')) if word.isdigit()),
        None,
    )


def _extract_bookname(scripture):
    """Heuristically extract a book name from the tail of a verse entry.

    The second-to-last space-separated word is the candidate (presumably the
    entry ends in a heading such as ``<book> <chapter>`` -- TODO confirm
    against the PDF layout).

    Returns:
        The detected book name, or None when nothing usable is found.
    """
    text = scripture.strip()
    words = text.split(' ')
    if len(words) < 2:
        # No second-to-last word to inspect.
        return None
    candidate = words[-2]

    # The candidate is glued to preceding text by punctuation: keep only what
    # follows the last occurrence of the first matching separator.
    for separator in _HEADING_SEPARATORS:
        if separator in candidate:
            return candidate.split(separator)[-1]

    tokens = _TOKEN.findall(text)
    if len(tokens) >= 3:
        preceding = tokens[-3]
        try:
            number = int(preceding)
        except ValueError:
            number = None
        if number is not None:
            # A number precedes the name token, e.g. '2' + ' Korinthe'.
            # A literal 0 is not treated as a book prefix.
            if number:
                return preceding + tokens[-2]
            return None
        highest_index = 2
    else:
        highest_index = 1

    # Otherwise take a token of the candidate word itself: the one at
    # ``highest_index`` when it exists, else the last available one.
    candidate_tokens = _TOKEN.findall(candidate)
    if not candidate_tokens:
        return None
    return candidate_tokens[min(highest_index, len(candidate_tokens) - 1)]


# Path of the PDF file to convert.
pdf_path = 'telos_nt.pdf'
extracted_text = extract_text_from_pdf(pdf_path)

merged_entries = merge_entries(extracted_text)
if not merged_entries:
    raise ValueError(f"No text could be extracted from {pdf_path!r}")

df = pd.DataFrame(merged_entries, columns=['Scripture'])

# Remove two annoying typos and strip every hyphen. regex=False is required:
# ')Efeziers' is not a valid regular expression, and older pandas versions
# treat the pattern as a regex by default.
for old, new in ((')Efeziers', 'Efeziërs'), (')Hebreeën', 'Hebreeën'), ('-', '')):
    df['Scripture'] = df['Scripture'].str.replace(old, new, regex=False)

# Split the leading verse number off the verse text.
df[['Verse', 'Scripture']] = df['Scripture'].str.split(' ', expand=True, n=1)

# Derive chapter and book name in plain Python so that None markers are not
# altered by a round-trip through pandas.
scriptures = df['Scripture'].tolist()
chapters = [_last_numeric_token(text) for text in scriptures]
booknames = [
    _extract_bookname(text) if chapter is not None else None
    for text, chapter in zip(scriptures, chapters)
]
df['Chapter'] = chapters
df['Bookname'] = booknames

# Rows without their own book/chapter inherit the nearest known value:
# forward fill first, then backward fill for any leading gap.
df['Bookname'] = df['Bookname'].ffill().bfill()
df['Chapter'] = df['Chapter'].ffill().bfill()

if df['Bookname'].isna().all():
    raise ValueError(f"No book names could be detected in {pdf_path!r}")

# Number the books in order of first appearance, starting at 40 (Matthew).
df['Book'] = pd.factorize(df['Bookname'])[0] + 40

df.to_csv('telos.csv', index=False)

df_details = pd.DataFrame(columns=['Description', 'Abbreviation', 'Comments', 'Version', 'VersionDate', 'PublishDate', 'RightToLeft', 'OT', 'NT', 'Strong'])
df_details.loc[0] = ['Telos vertaling', 'TELOS', 'Telos from PDF. H. Medema Vaassen, revised from the original Voorhoeve translation from 1877.', '1', '2024-09-29', '1982-01-01', 0, 0, 1, 0]

df['Verse'] = df['Verse'].astype(int)
df['Chapter'] = df['Chapter'].astype(int)
df['Book'] = df['Book'].astype(int)
df['Bookname'] = df['Bookname'].astype(str)
df['Scripture'] = df['Scripture'].astype(str)

# Write both dataframes to a single db file; closing() guarantees the
# connection is released even when a write fails.
with closing(sqlite3.connect('telos.mybible')) as conn:
    df[['Book', 'Chapter', 'Verse', 'Scripture']].to_sql('Scripture', conn, if_exists='replace', index=False)
    df_details.to_sql('Details', conn, if_exists='replace', index=False)