Parse XMP metadata in separate function, add exception, try multiple metadata formats

2024-09-21 21:52:29 +00:00 · 2021-03-16 17:53:33 +01:00 · 2021-03-16 17:53:33 +01:00 · 8abfaf0ffd
commit 8abfaf0ffd
parent 33e352819c
1 changed files with 68 additions and 33 deletions
--- a/cps/uploader.py
+++ b/cps/uploader.py
@ -22,11 +22,10 @@ import hashlib
 from tempfile import gettempdir
 from flask_babel import gettext as _

-from . import logger, comic, isoLanguages
+from . import logger, comic, isoLanguages, get_locale
 from .constants import BookMeta
 from .helper import split_authors

-
 log = logger.create()


@ -82,7 +81,7 @@ def process(tmp_file_path, original_file_name, original_file_extension, rarExecu
                                        original_file_name,
                                        original_file_extension,
                                        rarExecutable)
-    except Exception as ex:
+    except Exception as ex:      
        log.warning('cannot parse metadata, using default: %s', ex)

    if meta and meta.title.strip() and meta.author.strip():
@ -106,59 +105,95 @@ def default_meta(tmp_file_path, original_file_name, original_file_extension):
        languages="")


-def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
-    doc_info = None
-    if use_pdf_meta:
-        with open(tmp_file_path, 'rb') as f:
-            pdf_file = PdfFileReader(f)
-            doc_info = pdf_file.getDocumentInfo()
-            xmp_info = pdf_file.getXmpMetadata() 
+def parse_xmp(pdf_file):
+    """
+    Parse XMP Metadata and prepare for BookMeta object 
+    """
+    try:
+        xmp_info = pdf_file.getXmpMetadata()
+    except Exception as e:
+        log.debug('Can not read XMP metadata', e)
+        return None
+
    if xmp_info:
-        xmp_author = xmp_info.dc_creator
+        try:
+            xmp_author = xmp_info.dc_creator # list
+        except:
+            xmp_author = ['Unknown']
+        
        if xmp_info.dc_title: 
            xmp_title = xmp_info.dc_title['x-default']
        else:
            xmp_title = ''
+
        if xmp_info.dc_description:
            xmp_description = xmp_info.dc_description['x-default']
        else:
            xmp_description = ''
-        xmp_tags = ', '.join(xmp_info.dc_subject)
-        xmp_language = xmp_info.dc_language[0]
-        xmp_publisher = ', '.join(xmp_info.dc_publisher)

-    if xmp_info or doc_info:
-        author = xmp_author or split_authors([doc_info.author]) or u'Unknown'
-        title = xmp_title or doc_info.title or original_file_name
-        subject = xmp_description or doc_info.subject
-        publisher = xmp_publisher
-        tags = xmp_tags or doc_info['/Keywords'] 
-        if xmp_language :
-            lang = xmp_language.split('-', 1)[0].lower()
-            if len(lang) == 2:
-                language = isoLanguages.get(part1=lang).name
-            elif len(lang) == 3:
-                language = isoLanguages.get(part3=lang).name
-            else:
-                language = ''
-        else:
-            language = ''
+        languages = []
+        for i in xmp_info.dc_language:
+            #calibre-web currently only takes one language.
+            languages.append(isoLanguages.get_lang3(i))
+        
+        xmp_tags = ', '.join(xmp_info.dc_subject)
+        xmp_publisher = ', '.join(xmp_info.dc_publisher)
+        xmp_languages = xmp_info.dc_language
+
+        return {'author': xmp_author,
+                    'title': xmp_title,
+                    'subject': xmp_description,
+                    'tags': xmp_tags, 'languages': languages,
+                    'publisher': xmp_publisher
+                    }
+
+
+def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
+    doc_info = None
+    xmp_info = None
+
+    if use_pdf_meta:
+        with open(tmp_file_path, 'rb') as f:
+            pdf_file = PdfFileReader(f)
+            doc_info = pdf_file.getDocumentInfo()
+            xmp_info = parse_xmp(pdf_file)
+
+    if xmp_info:
+        author = ' & '.join(split_authors(xmp_info['author']))
+        title = xmp_info['title']
+        subject = xmp_info['subject']
+        tags = xmp_info['tags']
+        languages = xmp_info['languages']
+        publisher = xmp_info['publisher']
+
+    elif doc_info:
+        author = ' & '.join(split_authors([doc_info.author]))
+        title = doc_info.title
+        subject = doc_info.subject
+        tags = doc_info['/Keywords']
+        languages = ""
+        publisher = ""
+
    else:
-        author = u'Unknown'
+        author= u'Unknown'
        title = original_file_name
        subject = ""
+        tags = ""
+        languages = ""
+        publisher = ""

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
-        author=' & '.join(author),
+        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags=tags,
        series="",
        series_id="",
-        languages=language)
+        languages=', '.join(languages)
+        )


 def pdf_preview(tmp_file_path, tmp_dir):