Adding work so far
This commit is contained in:
commit
ede1783a3a
BIN
akjv.bbl.mybible/akjv.bbl.db
Normal file
BIN
akjv.bbl.mybible/akjv.bbl.db
Normal file
Binary file not shown.
121
extract-text.py
Normal file
121
extract-text.py
Normal file
@ -0,0 +1,121 @@
|
||||
import PyPDF2
|
||||
import pandas as pd
|
||||
import regex, re
|
||||
|
||||
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as one flat list of lines.

    Every page is read in document order; each page's extracted text is
    split on newline characters and the pieces are appended to a single list.

    Args:
        pdf_path: Path of the PDF file. It is opened in binary mode.

    Returns:
        A list of strings, one per extracted line, across all pages.
    """
    lines = []

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        # Iterate the pages directly instead of indexing by position.
        for page in pdf_reader.pages:
            lines.extend(page.extract_text().split('\n'))

    return lines
|
||||
|
||||
# Location of the source PDF. The path is hard-coded; adjust it to where the
# file lives on your machine.
pdf_path = '/mnt/c/Projects/telosvertaling/telos_nt.pdf'
# All lines of the PDF, in page order, as a list of strings.
extracted_text = extract_text_from_pdf(pdf_path)
|
||||
|
||||
def merge_entries(verse_list):
    """Glue continuation lines onto the numbered line that precedes them.

    A line whose first character is a digit opens a new entry. Every other
    line is appended to the entry currently being built, separated from it
    by a single space.

    Args:
        verse_list: Iterable of text lines.

    Returns:
        A list of merged entries in input order. Lines that occur before the
        first digit-led line form one entry of their own; that entry starts
        with a space because it is joined onto an empty seed.
    """
    # Look the pattern up once instead of on every iteration.
    starts_with_digit = re.compile(r'^\d').match

    merged_list = []
    # Collect the pieces of the current entry and join them once per entry
    # rather than growing a string with "+=". The empty seed keeps the
    # leading space on any text that precedes the first numbered line.
    parts = [""]

    for verse in verse_list:
        if starts_with_digit(verse):
            # A new numbered line closes the entry built so far.
            entry = ' '.join(parts)
            if entry:
                merged_list.append(entry)
            parts = [verse]
        else:
            parts.append(verse)

    # Flush whatever is still pending after the last line.
    entry = ' '.join(parts)
    if entry:
        merged_list.append(entry)

    return merged_list
|
||||
|
||||
merged_entries = merge_entries(extracted_text)
df = pd.DataFrame(merged_entries, columns=['Scripture'])

# Repair two book names that were extracted with a stray leading ')'.
# regex=False is required for portability: the search strings begin with ')',
# which is not a valid regular expression, and pandas releases before 2.0
# interpreted the pattern as a regex by default.
df['Scripture'] = df['Scripture'].str.replace(')Efeziers', 'Efeziërs', regex=False)
df['Scripture'] = df['Scripture'].str.replace(')Hebreeën', 'Hebreeën', regex=False)

# Remove every hyphen from the text.
# NOTE(review): presumably these are line-break hyphenations from the PDF;
# genuine hyphenated words are removed as well - confirm this is intended.
df['Scripture'] = df['Scripture'].str.replace('-', '', regex=False)
|
||||
|
||||
# Split each entry at its first space: the part before it becomes 'Verse',
# the remainder stays in 'Scripture'.
df[['Verse', 'Scripture']] = df['Scripture'].str.split(' ', expand=True, n=1)

# One column per space-separated word of the remaining text.
split_columns = df['Scripture'].str.split(' ', expand=True)


def _last_numeric_word(words):
    """Return the last all-digit string in *words*, or None if there is none."""
    for word in reversed(words):
        if isinstance(word, str) and word.isdigit():
            return word
    return None


# The chapter is taken to be the last word of the row that consists only of
# digits; rows without such a word get None.
df['Chapter'] = split_columns.apply(_last_numeric_word, axis=1)
|
||||
|
||||
# Tokeniser for the fallback path: a capitalised word (optionally together
# with the single non-letter character in front of it), a run of lowercase
# letters, or a run of ASCII digits.
_WORD_PIECES = regex.compile(r'\P{L}\p{Lu}\p{Ll}*|\p{Lu}\p{Ll}*|\p{Ll}+|[0-9]+')

# Characters that may glue the preceding text onto the book name. They are
# tested in this order and the first one present in the word is split on.
_GLUE_CHARACTERS = (',', '.', ';', ':', '?', '!', ')')


def _extract_bookname(scripture):
    """Derive a book name from the second-to-last word of *scripture*.

    Args:
        scripture: Verse text that is expected to end in "<book> <chapter>".

    Returns:
        The book name as a string, or None when none can be derived: the
        text has fewer than two space-separated words, the word holds no
        letters or digits, or the numeric prefix in front of it is zero.
    """
    text = scripture.strip()
    words = text.split(' ')
    if len(words) < 2:
        # Nothing precedes the last word; previously this raised IndexError.
        return None
    candidate = words[-2]

    # The candidate is glued to preceding text by punctuation: keep only
    # what follows the last occurrence of that character.
    for glue in _GLUE_CHARACTERS:
        if glue in candidate:
            return candidate.split(glue)[-1]

    # Kept for parity with the earlier branch chain, which returned the word
    # unchanged here. Hyphens are stripped from 'Scripture' earlier in the
    # script, so this is not expected to trigger.
    if '-' in candidate:
        return candidate

    pieces = _WORD_PIECES.findall(text)
    # Index of the preferred piece of the candidate when no numeric prefix
    # is involved; lowered automatically if the candidate has fewer pieces.
    preferred = 1
    if len(pieces) >= 3:
        prefix = pieces[-3]
        try:
            prefix_number = int(prefix)
        except ValueError:
            # Not a number: take the candidate's third piece when it exists.
            preferred = 2
        else:
            # A number directly in front of the name is prepended to it.
            # A zero prefix yields no name, as before.
            if prefix_number:
                return prefix + pieces[-2]
            return None

    fragments = _WORD_PIECES.findall(candidate)
    if not fragments:
        # No letters or digits at all; previously this raised IndexError.
        return None
    return fragments[min(preferred, len(fragments) - 1)]


# Only rows in which a chapter number was found can carry a book name; the
# remaining rows get None.
df['Bookname'] = [
    _extract_bookname(scripture) if chapter is not None else None
    for scripture, chapter in zip(df['Scripture'], df['Chapter'])
]
|
||||
|
||||
# Rows without a detected book name take the nearest earlier one; rows before
# the first detection take the nearest later one. ffill()/bfill() replace the
# deprecated fillna(method=...) spelling.
df['Bookname'] = df['Bookname'].ffill().bfill()

# Same treatment for the chapter number.
df['Chapter'] = df['Chapter'].ffill().bfill()
|
||||
|
||||
# Add a 'Book' column that numbers the distinct book names in order of first
# appearance, starting at 40.
# NOTE(review): this matches the usual numbering (Matthew = 40) only if every
# book name was extracted consistently and appears in canonical order.
# The distinct names are computed once instead of on every iteration.
book_order = list(df['Bookname'].unique())
for offset, book in enumerate(book_order):
    df.loc[df['Bookname'] == book, 'Book'] = offset + 40
|
||||
|
||||
# Save the parsed table as CSV, without the index column.
df.to_csv('/mnt/c/Projects/telosvertaling/telos.csv', index=False)

# Write the same table to the 'telos' table of the SQLite database telos.db,
# replacing the table if it already exists.
df.to_sql('telos', 'sqlite:///telos.db', if_exists='replace', index=False)

# Disabled debug dump of the merged entries:
# with open('/mnt/c/Projects/telosvertaling/merged_entries.txt', 'w', encoding='utf-8') as f:
#     f.write(str([text for text in merged_entries]))
|
||||
|
||||
|
1
extracted_text.txt
Normal file
1
extracted_text.txt
Normal file
File diff suppressed because one or more lines are too long
1
merged_entries.txt
Normal file
1
merged_entries.txt
Normal file
@ -0,0 +1 @@
|
||||
[]
|
BIN
nieuwe-testament-telos-vertaling.jpg
Normal file
BIN
nieuwe-testament-telos-vertaling.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 76 KiB |
BIN
telos_nt.pdf
Normal file
BIN
telos_nt.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user