# telos-extract/extract-text.py

import PyPDF2
import pandas as pd
import regex, re

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one flat list of lines.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        list[str]: The extracted text of all pages, in page order, split on
        newline characters. A page without any extractable text contributes
        a single empty string.
    """
    lines = []

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        # Iterate the pages directly; the page index itself is never needed.
        for page in pdf_reader.pages:
            # Guard against an extractor returning None for an empty page so
            # that .split() cannot fail; "" splits to [''] exactly as before.
            page_text = page.extract_text() or ""
            lines.extend(page_text.split('\n'))

    return lines

# Path of the source PDF; a relative path, so it is resolved against the
# current working directory when the file is opened.
pdf_path = 'telos_nt.pdf'
# extract_text_from_pdf returns a list of lines (not a single string), so no
# newline stripping is applied to its result here.
extracted_text = extract_text_from_pdf(pdf_path)

def merge_entries(verse_list):
    """Join wrapped lines so that each verse ends up as a single string.

    A line whose first character is a digit opens a new entry. Every other
    line is appended to the entry currently being built, preceded by one
    space. Lines seen before the first digit-led line are gathered into an
    entry of their own, which therefore starts with a space.

    Args:
        verse_list: Iterable of text lines.

    Returns:
        list[str]: The merged entries in input order; empty entries are
        never emitted.
    """
    entries = []
    # Pieces of the entry under construction. Seeding with "" makes the
    # joined text identical to starting from an empty string and repeatedly
    # appending ' ' + line.
    fragments = [""]

    for line in verse_list:
        if not re.match(r'^\d', line):
            # Continuation line: belongs to the entry in progress.
            fragments.append(line)
            continue

        # A digit-led line closes the entry in progress (if it has content)
        # and becomes the start of the next one.
        pending = ' '.join(fragments)
        if pending:
            entries.append(pending)
        fragments = [line]

    # Emit whatever is still being built when the input runs out.
    trailing = ' '.join(fragments)
    if trailing:
        entries.append(trailing)

    return entries

# Merge the wrapped PDF lines into one string per entry and load them into a
# single-column DataFrame.
merged_entries = merge_entries(extracted_text)
df = pd.DataFrame(merged_entries, columns=['Scripture'])

# Fix two book names that show up in the extracted text with a stray ')'
# prefix. regex=False is spelled out because the search strings start with
# ')' and are not valid regular expressions; pandas releases before 2.0
# default to regex=True and would fail on them.
df['Scripture'] = df['Scripture'].str.replace(')Efeziers', 'Efeziërs', regex=False)
df['Scripture'] = df['Scripture'].str.replace(')Hebreeën', 'Hebreeën', regex=False)

# Drop every ASCII hyphen from the text.
df['Scripture'] = df['Scripture'].str.replace('-', '', regex=False)

# Split each entry at its first space: the leading token becomes 'Verse',
# the remainder stays in 'Scripture'.
df[['Verse', 'Scripture']] = df['Scripture'].str.split(' ', expand=True, n=1)

# Extract book name and chapter
split_columns = df['Scripture'].str.split(' ', expand=True)

# The chapter is the last space-separated token of the verse text that
# consists solely of digits; rows without such a token get None.
# NOTE(review): any stand-alone number inside a verse also qualifies, so this
# presumably relies on chapter headings being the only such tokens - confirm.
df['Chapter'] = split_columns.apply(lambda x: next((elem for elem in reversed(x) if isinstance(elem, str) and elem.isdigit()), None), axis=1)

# Tokenizer used by the fallback below. It cuts text into: a capitalized word
# with one preceding non-letter character, a capitalized word, a run of
# lowercase letters, or a run of ASCII digits.
_BOOK_TOKEN_PATTERN = r'\P{L}\p{Lu}\p{Ll}*|\p{Lu}\p{Ll}*|\p{Ll}+|[0-9]+'

# Punctuation marks that may sit between the end of the verse text and the
# book name inside one token. They are tried in exactly this order and the
# first one found in the token wins.
_BOOK_DELIMITERS = (',', '.', ';', ':', '?', '!', ')')

for index, row in df.iterrows():
    if row['Chapter'] is None:
        # No chapter number in this row, hence no book name to extract; the
        # gap is filled from neighbouring rows later in the script.
        df.at[index, 'Bookname'] = None
        continue

    scripture = row['Scripture'].strip()
    # Second-to-last space-separated token; presumably the book name (the
    # last token being the chapter number), possibly still glued to the end
    # of the preceding verse text.
    candidate = scripture.split(' ')[-2]

    delimiter = next((mark for mark in _BOOK_DELIMITERS if mark in candidate), None)
    if delimiter is not None:
        # Keep only the text after the last occurrence of the delimiter.
        df.at[index, 'Bookname'] = candidate.split(delimiter)[-1]
        continue

    if '-' in candidate:
        # NOTE(review): hyphens are stripped from 'Scripture' earlier in this
        # script, so this branch looks unreachable; it also splits on '-)'
        # rather than on '-'. Kept unchanged - confirm before removing.
        df.at[index, 'Bookname'] = candidate.split('-)')[-1]
        continue

    # No delimiter found: fall back to tokenizing on letter case and digits.
    candidate_tail = candidate.split('-)')[-1]
    try:
        tokens = regex.findall(_BOOK_TOKEN_PATTERN, scripture)
        # Token in front of the last two tokens of the whole verse text;
        # raises IndexError (handled below) when there are fewer than three.
        preceding_value = tokens[-3]
        try:
            # A numeric token is prepended to the token that follows it.
            # NOTE(review): when the number is 0 nothing is assigned for this
            # row - behaviour kept from the original, confirm it is intended.
            if int(preceding_value):
                df.at[index, 'Bookname'] = preceding_value + tokens[-2]
        except ValueError:
            # Not a number: take the third token of the candidate itself. An
            # IndexError raised here is handled by the outer handler.
            df.at[index, 'Bookname'] = regex.findall(_BOOK_TOKEN_PATTERN, candidate_tail)[2]
    except IndexError:
        # Too few tokens above: use the candidate's second token, or its
        # first when there is only one.
        tail_tokens = regex.findall(_BOOK_TOKEN_PATTERN, candidate_tail)
        try:
            df.at[index, 'Bookname'] = tail_tokens[1]
        except IndexError:
            df.at[index, 'Bookname'] = tail_tokens[0]

# Forward and backward fill Book: rows without an extracted book name inherit
# the closest earlier one, and any leading gap takes the first one found.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
df['Bookname'] = df['Bookname'].ffill().bfill()

# Forward and backward fill Chapter in the same way.
df['Chapter'] = df['Chapter'].ffill().bfill()

# Create the 'Book' column: each distinct 'Bookname' gets a number, counting
# up from 40 in order of first appearance.
# NOTE(review): this yields the intended numbering (40 for Matthew onwards)
# only if the books appear in canonical order and every extracted name is
# correct - a mis-extracted name gets a number of its own. Confirm on output.
for offset, book in enumerate(df['Bookname'].unique()):
    # enumerate supplies the position directly, so the list of unique names
    # is not rebuilt and searched on every iteration.
    df.loc[df['Bookname'] == book, 'Book'] = offset + 40

# Dump the intermediate table to CSV. This happens before the type
# conversions below, so the CSV holds the columns as they were extracted.
df.to_csv('telos.csv', index=False)

# Single-row metadata table describing the translation.
df_details = pd.DataFrame(columns=['Description', 'Abbreviation', 'Comments', 'Version', 'VersionDate', 'PublishDate', 'RightToLeft', 'OT', 'NT', 'Strong'])
df_details.loc[0] = ['Telos vertaling', 'TELOS', 'Telos from PDF. H. Medema Vaassen, revised from the original Voorhoeve translation from 1877.', '1', '2024-09-29', '1982-01-01', 0, 0, 1, 0]

# Give the columns concrete types before they are written to the database.
df['Verse'] = df['Verse'].astype(int)
df['Chapter'] = df['Chapter'].astype(int)
df['Book'] = df['Book'].astype(int)
df['Bookname'] = df['Bookname'].astype(str)
df['Scripture'] = df['Scripture'].astype(str)
# Write both dataframes to a single db file
import sqlite3

conn = sqlite3.connect('telos.mybible')
try:
    # Existing tables of the same name are replaced on every run.
    df[['Book', 'Chapter', 'Verse', 'Scripture']].to_sql('Scripture', conn, if_exists='replace', index=False)
    df_details.to_sql('Details', conn, if_exists='replace', index=False)
finally:
    # Close the connection even when one of the writes fails.
    conn.close()

# with open('/mnt/c/Projects/telosvertaling/merged_entries.txt', 'w', encoding='utf-8') as f:
#     f.write(str([text for text in merged_entries]))
Adding work so far 2024-01-29 21:19:10 +01:00			`import PyPDF2`
			`import pandas as pd`
			`import regex, re`

			`def extract_text_from_pdf(pdf_path):`
			`text = ""`

			`with open(pdf_path, 'rb') as file:`
			`pdf_reader = PyPDF2.PdfReader(file)`
			`lines = []`

			`for page_num in range(len(pdf_reader.pages)):`
			`page = pdf_reader.pages[page_num]`
			`lines.extend(page.extract_text().split('\n'))`

			`return lines`

			`# Replace 'your_pdf_file.pdf' with the actual path to your PDF file`
Added mybible 2024-09-29 22:37:18 +02:00			`pdf_path = 'telos_nt.pdf'`
Adding work so far 2024-01-29 21:19:10 +01:00			`extracted_text = extract_text_from_pdf(pdf_path)#.replace('\n', '')`

			`def merge_entries(verse_list):`
			`merged_list = []`
			`current_entry = ""`

			`for verse in verse_list:`
			`# Check if the verse starts with a number`
			`if re.match(r'^\d', verse):`
			`# If it does, append the current entry to the merged list`
			`if current_entry:`
			`merged_list.append(current_entry)`
			`# Set the current entry to the current verse`
			`current_entry = verse`
			`else:`
			`# If it doesn't start with a number, concatenate to the current entry`
			`current_entry += ' ' + verse`

			`# Append the last entry`
			`if current_entry:`
			`merged_list.append(current_entry)`

			`return merged_list`

			`merged_entries = merge_entries(extracted_text)`
			`df = pd.DataFrame(merged_entries, columns=['Scripture'])`

			`# Remove an annoying typo`
			`df['Scripture'] = df['Scripture'].str.replace(')Efeziers', 'Efeziërs')`
			`df['Scripture'] = df['Scripture'].str.replace(')Hebreeën', 'Hebreeën')`

			`df['Scripture'] = df['Scripture'].str.replace('-', '')`

			`# Extract scripture and verse`
			`df[['Verse', 'Scripture']] = df['Scripture'].str.split(' ', expand=True, n=1)`

			`# Extract book name and chapter`
			`split_columns = df['Scripture'].str.split(' ', expand=True)`



			`# Extract the last two elements only if the last element is a number`
			`df['Chapter'] = split_columns.apply(lambda x: next((elem for elem in reversed(x) if isinstance(elem, str) and elem.isdigit()), None), axis=1)`

			`for index, row in df.iterrows():`
			`if row['Chapter'] is not None:`
			`if ',' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split(',')[-1]`
			`elif '.' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split('.')[-1]`
			`elif ';' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split(';')[-1]`
			`elif ':' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split(':')[-1]`
			`elif '?' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split('?')[-1]`
			`elif '!' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split('!')[-1]`
			`elif ')' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split(')')[-1]`
			`elif '-' in row['Scripture'].strip().split(' ')[-2]:`
			`df.at[index, 'Bookname'] = row['Scripture'].strip().split(' ')[-2].split('-)')[-1]`
			`else:`
			`# Split on capital letters preceded by lowercase letters`
			`try:`
			`# Check if preceding value is a number`

			`preceding_value = regex.findall(r'\P{L}\p{Lu}\p{Ll}\|\p{Lu}\p{Ll}\|\p{Ll}+\|[0-9]+', row['Scripture'].strip())[-3]`
			`try:`
			`if int(preceding_value):`
			`df.at[index, 'Bookname'] = preceding_value + regex.findall(r'\P{L}\p{Lu}\p{Ll}\|\p{Lu}\p{Ll}\|\p{Ll}+\|[0-9]+', row['Scripture'].strip())[-2]`
			`except:`
			`df.at[index, 'Bookname'] = regex.findall(r'\P{L}\p{Lu}\p{Ll}\|\p{Lu}\p{Ll}\|\p{Ll}+\|[0-9]+', row['Scripture'].strip().split(' ')[-2].split('-)')[-1])[2]`
			`except IndexError:`
			`try:`
			`df.at[index, 'Bookname'] = regex.findall(r'\P{L}\p{Lu}\p{Ll}\|\p{Lu}\p{Ll}\|\p{Ll}+\|[0-9]+', row['Scripture'].strip().split(' ')[-2].split('-)')[-1])[1]`
			`except IndexError:`
			`df.at[index, 'Bookname'] = regex.findall(r'\P{L}\p{Lu}\p{Ll}\|\p{Lu}\p{Ll}\|\p{Ll}+\|[0-9]+', row['Scripture'].strip().split(' ')[-2].split('-)')[-1])[0]`
			`else:`
			`df.at[index, 'Bookname'] = None # Replace 'new_value' with your desired modification`

			`# Forward and backward fill Book`
			`df['Bookname'] = df['Bookname'].fillna(method='ffill')`
			`df['Bookname'] = df['Bookname'].fillna(method='bfill')`

			`# Forward and backward fill Chapter`
			`df['Chapter'] = df['Chapter'].fillna(method='ffill')`
			`df['Chapter'] = df['Chapter'].fillna(method='bfill')`

			`# Create a new column, named 'Book', which replaces the name in 'Bookname' with a number, starting at 40 for Matthew to Revelations, based on the set() of 'Bookname'`
			`for book in df['Bookname'].unique():`
			`df.loc[df['Bookname'] == book, 'Book'] = list(df['Bookname'].unique()).index(book) + 40`

Added mybible 2024-09-29 22:37:18 +02:00			`df.to_csv('telos.csv', index=False)`
Adding work so far 2024-01-29 21:19:10 +01:00
Added mybible 2024-09-29 22:37:18 +02:00			`df_details = pd.DataFrame(columns=['Description', 'Abbreviation', 'Comments', 'Version', 'VersionDate', 'PublishDate', 'RightToLeft', 'OT', 'NT', 'Strong'])`
			`df_details.loc[0] = ['Telos vertaling', 'TELOS', 'Telos from PDF. H. Medema Vaassen, revised from the original Voorhoeve translation from 1877.', '1', '2024-09-29', '1982-01-01', 0, 0, 1, 0]`

			`df['Verse'] = df['Verse'].astype(int)`
			`df['Chapter'] = df['Chapter'].astype(int)`
			`df['Book'] = df['Book'].astype(int)`
			`df['Bookname'] = df['Bookname'].astype(str)`
			`df['Scripture'] = df['Scripture'].astype(str)`
			`# Write both dataframes to a single db file`
			`import sqlite3`

			`conn = sqlite3.connect('telos.mybible')`
			`df[['Book', 'Chapter', 'Verse', 'Scripture']].to_sql('Scripture', conn, if_exists='replace', index=False)`
			`df_details.to_sql('Details', conn, if_exists='replace', index=False)`
			`conn.close()`
Adding work so far 2024-01-29 21:19:10 +01:00
			`# with open('/mnt/c/Projects/telosvertaling/merged_entries.txt', 'w', encoding='utf-8') as f:`
			`# f.write(str([text for text in merged_entries]))`