Adding work so far

2024-01-29 21:19:10 +01:00 · 2024-01-29 21:19:10 +01:00 · ede1783a3a
commit ede1783a3a
7 changed files with 7871 additions and 0 deletions
--- a/akjv.bbl.mybible/akjv.bbl.db
+++ b/akjv.bbl.mybible/akjv.bbl.db
--- a/extract-text.py
+++ b/extract-text.py
@ -0,0 +1,121 @@
 import PyPDF2
 import pandas as pd
 import regex, re
 def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of a PDF as a flat list of lines.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A list of strings: the text extracted from each page, split on
        newline characters and concatenated in page order.
    """
    lines = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Walk the pages directly rather than indexing them by page number.
        for page in pdf_reader.pages:
            lines.extend(page.extract_text().split('\n'))
    return lines
 # Hard-coded location of the source PDF; change this path to point at your own copy
 pdf_path = '/mnt/c/Projects/telosvertaling/telos_nt.pdf'
 # Run the extraction once when the script executes; the result is used below as the list of raw lines
 extracted_text = extract_text_from_pdf(pdf_path)
 def merge_entries(verse_list):
    """Fold continuation lines into the numbered line that precedes them.

    A line whose first character is a digit opens a new entry. Any other
    line is appended, after a single space, to the entry currently being
    built. Lines seen before the first numbered line are joined the same
    way, so that entry starts with a space.

    Args:
        verse_list: Iterable of text lines.

    Returns:
        The list of merged entries, in input order; empty entries are
        never emitted.
    """
    starts_with_digit = re.compile(r'^\d').match
    entries = []
    pending = ""
    for line in verse_list:
        if not starts_with_digit(line):
            # Continuation line: glue it onto whatever is being built.
            pending = pending + ' ' + line
            continue
        # A numbered line closes the entry in progress (if any) and opens a new one.
        if pending:
            entries.append(pending)
        pending = line
    # Flush the entry that was still open when the input ran out.
    if pending:
        entries.append(pending)
    return entries
 # Group the raw lines into one entry per numbered line and load them into a single-column table
 merged_entries = merge_entries(extracted_text)
 df = pd.DataFrame(merged_entries, columns=['Scripture'])
 # Fix two mangled book titles (a stray ')' prefix; 'Efeziers' also gets its diaeresis).
 # regex=False is passed explicitly: the search text starts with ')', which is not a valid
 # regular expression, and pandas versions before 2.0 interpret the pattern as a regex by default.
 df['Scripture'] = df['Scripture'].str.replace(')Efeziers', 'Efeziërs', regex=False)
 df['Scripture'] = df['Scripture'].str.replace(')Hebreeën', 'Hebreeën', regex=False)
 # Remove every hyphen from the text (presumably line-break hyphenation left by the PDF - TODO confirm)
 df['Scripture'] = df['Scripture'].str.replace('-', '', regex=False)
 # Split each entry at its first space: the leading part goes to 'Verse', the remainder stays in 'Scripture'
 df[['Verse', 'Scripture']] = df['Scripture'].str.split(' ', expand=True, n=1)
 # Split the verse text into one column per space-separated word
 split_columns = df['Scripture'].str.split(' ', expand=True)
 # Chapter = the last word of the row made up only of digits, or None when the row has no such word
 df['Chapter'] = split_columns.apply(lambda x: next((elem for elem in reversed(x) if isinstance(elem, str) and elem.isdigit()), None), axis=1)
 # Tokenizer, compiled once instead of once per row. It cuts text into: a non-letter followed by a
 # capitalized word, a capitalized word, a run of lowercase letters, or a run of ASCII digits.
 _NAME_PARTS = regex.compile(r'\P{L}\p{Lu}\p{Ll}*|\p{Lu}\p{Ll}*|\p{Ll}+|[0-9]+')
 # Delimiters tested in this order; the first one found in the candidate word decides the split.
 _BOOK_DELIMITERS = (',', '.', ';', ':', '?', '!', ')')
 def _extract_bookname(scripture):
    """Guess the book name standing in front of the trailing chapter number.

    The candidate is the second-to-last space-separated word of the stripped
    text. Other text may be attached to the front of that word (presumably
    the end of the previous verse - TODO confirm), so the name is taken as
    whatever follows the last delimiter in the word, or, when the word has no
    delimiter, it is picked out by tokenizing on letter case.

    Args:
        scripture: Verse text of a row for which a chapter number was found.

    Returns:
        The guessed book name, or None when no guess is possible: fewer than
        two words, nothing to tokenize, or a preceding number equal to 0.
    """
    text = scripture.strip()
    words = text.split(' ')
    if len(words) < 2:
        # There is no second-to-last word; indexing [-2] would raise IndexError.
        return None
    candidate = words[-2]
    for delimiter in _BOOK_DELIMITERS:
        if delimiter in candidate:
            return candidate.split(delimiter)[-1]
    if '-' in candidate:
        # ')' was ruled out above, so splitting on '-)' leaves the word whole.
        return candidate
    parts = _NAME_PARTS.findall(text)
    candidate_parts = _NAME_PARTS.findall(candidate)
    # Highest token index of the candidate word to try; lower indices are the fallback.
    fallback_index = 1
    if len(parts) >= 3:
        preceding_value = parts[-3]
        try:
            number = int(preceding_value)
        except ValueError:
            # The token before the name is not a number: pick from the candidate word itself.
            fallback_index = 2
        else:
            # Numbered book: prefix the number to the name token. A number
            # of 0 yields no name, leaving the row to be filled from its neighbours.
            return preceding_value + parts[-2] if number else None
    if not candidate_parts:
        return None
    return candidate_parts[min(fallback_index, len(candidate_parts) - 1)]
 # Rows without a chapter get no book name here; they are filled in from neighbouring rows later
 df['Bookname'] = [
    _extract_bookname(scripture) if chapter is not None else None
    for scripture, chapter in zip(df['Scripture'], df['Chapter'])
 ]
 # Rows without a detected book take the value of the nearest earlier row; rows before the
 # first detection take the first value that follows. Series.ffill()/bfill() replace the
 # deprecated fillna(method=...) spelling.
 df['Bookname'] = df['Bookname'].ffill().bfill()
 # Same forward-then-backward fill for the chapter number
 df['Chapter'] = df['Chapter'].ffill().bfill()
 # Create the 'Book' column: each distinct 'Bookname' gets a number in order of first appearance,
 # starting at 40 (intended as Matthew through Revelation). The mapping is built once instead of
 # recomputing unique() and scanning the column for every book.
 book_numbers = {book: number for number, book in enumerate(df['Bookname'].dropna().unique(), start=40)}
 # Cast to float to keep the column dtype of the earlier mask-based assignment (40.0, 41.0, ...)
 # NOTE(review): switch to an integer dtype if consumers of telos.csv / telos.db allow it.
 df['Book'] = df['Bookname'].map(book_numbers).astype(float)
 # Export the table as CSV to a hard-coded path, without the DataFrame index
 df.to_csv('/mnt/c/Projects/telosvertaling/telos.csv', index=False)
 # Write the same table to a SQLite database as table 'telos', replacing the table if it exists.
 # NOTE(review): 'sqlite:///telos.db' looks like a relative database path, so the file would be
 # created in the current working directory rather than next to the CSV - confirm this is intended.
 df.to_sql('telos', 'sqlite:///telos.db', if_exists='replace', index=False)
 # Disabled debug dump of the merged entries to a text file:
 # with open('/mnt/c/Projects/telosvertaling/merged_entries.txt', 'w', encoding='utf-8') as f:
 #     f.write(str([text for text in merged_entries]))
--- a/extracted_text.txt
+++ b/extracted_text.txt
--- a/merged_entries.txt
+++ b/merged_entries.txt
@ -0,0 +1 @@
 []
--- a/nieuwe-testament-telos-vertaling.jpg
+++ b/nieuwe-testament-telos-vertaling.jpg
--- a/telos.csv
+++ b/telos.csv
--- a/telos_nt.pdf
+++ b/telos_nt.pdf