# LazyLibrarian/lazylibrarian/cache.py
# This file is part of Lazylibrarian.
# Lazylibrarian is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# Lazylibrarian is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Lazylibrarian. If not, see <http://www.gnu.org/licenses/>.
import abc
import http.client
import itertools
import json
import logging
import os
import shutil
import time
from abc import ABC
from enum import Enum
from http.client import responses
from typing import Any
from xml.etree import ElementTree
import requests
import lazylibrarian
from lazylibrarian import database
from lazylibrarian.blockhandler import BLOCKHANDLER
from lazylibrarian.common import get_user_agent, proxy_list
from lazylibrarian.config2 import CONFIG
from lazylibrarian.filesystem import (
DIRS,
listdir,
path_isdir,
path_isfile,
remove_file,
splitext,
syspath,
)
from lazylibrarian.formatter import (
check_int,
make_bytestr,
make_unicode,
md5_utf8,
plural,
seconds_to_midnight,
thread_name,
)
class ImageType(Enum):
""" Types of images we cache in separate dirs """
BOOK = 'book'
AUTHOR = 'author'
MAG = 'magazine'
COMIC = 'comic'
TEST = 'test'
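# Substrings identifying external services that BLOCKHANDLER may temporarily block after errors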
service_blocked = ['goodreads', 'librarything', 'googleapis', 'openlibrary', 'hardcover', 'dnb', 'ISBN']
def gr_api_sleep():
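    """ Enforce a minimum one-second gap between successive GoodReads API requests """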
time_now = time.time()
delay = time_now - lazylibrarian.TIMERS['LAST_GR']
if delay < 1.0:
sleep_time = 1.0 - delay
lazylibrarian.TIMERS['SLEEP_GR'] += sleep_time
cachelogger = logging.getLogger('special.cache')
cachelogger.debug(f"GoodReads sleep {sleep_time:.3f}, total {lazylibrarian.TIMERS['SLEEP_GR']:.3f}")
time.sleep(sleep_time)
lazylibrarian.TIMERS['LAST_GR'] = time_now
def init_hex_caches() -> bool:
""" Initialize the directory structure for each of the caches that use a two-layer dir structure for efficiency.
    Returns True on success.
"""
logger = logging.getLogger()
ok = True
caches = ["WorkCache"] # This one doesn't have its own handler class
for cache in [HTMLCacheRequest, JSONCacheRequest, XMLCacheRequest]:
caches.append(cache.cachedir_name())
for item in caches:
pth = DIRS.get_cachedir(item)
subdirs = itertools.product("0123456789abcdef", repeat=2)
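        # create all 256 leaf directories <cache>/<0-f>/<0-f> and check they are writeable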
for i, j in subdirs:
cachelocation = os.path.join(pth, i, j)
isok, msg = DIRS.ensure_dir_is_writeable(cachelocation)
if not isok:
logger.error(msg)
ok = False
return ok
def fetch_url(url: str, headers: dict | None = None, retry=True, timeout=True,
raw: bool = False) -> (str | bytes, bool):
""" Return the result of fetching a URL and True if success
Otherwise return error message and False
Return data as raw/bytes, if raw == True
Default to unicode, need to set raw=True for images/data
Allow one retry on timeout by default """
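    # Illustrative usage (cover_url is any caller-supplied image URL):
    #   text, ok = fetch_url('https://openlibrary.org/search.json?q=tolkien')
    #   data, ok = fetch_url(cover_url, raw=True)  # raw bytes, e.g. for images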
logger = logging.getLogger(__name__)
http.client.HTTPConnection.debuglevel = 1 if lazylibrarian.REQUESTSLOG else 0
# for key in logging.Logger.manager.loggerDict:
# print(key)
logging.getLogger('chardet').setLevel(logging.CRITICAL)
logging.getLogger('urllib3.connectionpool').setLevel(logging.CRITICAL)
url = make_unicode(url)
for blk in service_blocked:
if blk in url and BLOCKHANDLER.is_blocked(blk.split('.')[0]):
return 'Blocked', False
if headers is None:
# some sites insist on having a user-agent, default is to add one
# if you don't want any headers, pass headers={}
headers = {'User-Agent': get_user_agent()}
proxies = proxy_list()
# jackett query all indexers needs a longer timeout
# /torznab/all/api?q= or v2.0/indexers/all/results/torznab/api?q=
if timeout:
if '/torznab/' in url and ('/all/' in url or '/aggregate/' in url):
timeout = CONFIG.get_int('HTTP_EXT_TIMEOUT')
else:
timeout = CONFIG.get_int('HTTP_TIMEOUT')
payload = {}
if timeout:
payload["timeout"] = timeout
if proxies:
payload["proxies"] = proxies
verify = False
if url.startswith('https') and CONFIG.get_bool('SSL_VERIFY'):
verify = True
if CONFIG['SSL_CERTS']:
verify = CONFIG['SSL_CERTS']
try:
r = requests.get(url, verify=verify, params=payload, headers=headers)
except requests.exceptions.TooManyRedirects as e:
# This is to work around an oddity (bug??) with verified https goodreads requests
# Goodreads sometimes redirects back to the same page in a loop using code 301,
# and after a variable number of tries it might then return 200
# but if it takes more than 30 loops the requests library stops trying
# Retrying with verify off seems to clear it
if not retry:
logger.error(f"fetch_url: TooManyRedirects getting response from {url}")
return f"TooManyRedirects {str(e)}", False
logger.debug(f"Retrying - got TooManyRedirects on {url}")
try:
r = requests.get(url, verify=False, params=payload, headers=headers)
logger.debug(f"TooManyRedirects retry status code {r.status_code}")
except Exception as e:
return f"Exception {type(e).__name__}: {str(e)}", False
except requests.exceptions.Timeout as e:
if not retry:
logger.error(f"fetch_url: Timeout getting response from {url}")
return f"Timeout {str(e)}", False
logger.debug(f"fetch_url: retrying - got timeout on {url}")
try:
r = requests.get(url, verify=verify, params=payload, headers=headers)
except Exception as e:
return f"Exception {type(e).__name__}: {str(e)}", False
except Exception as e:
return f"Exception {type(e).__name__}: {str(e)}", False
if str(r.status_code).startswith('2'): # (200 OK etc)
if raw:
return r.content, True
return r.text, True
# we got an error response...
# noinspection PyBroadException
try:
source = r.json()
msg = source['error']['message']
except Exception:
msg = f"Error {r.status_code}"
if '503' in msg:
to_block = ''
for blk in service_blocked:
if blk in url:
to_block = blk
break
if to_block:
delay = 10
logger.debug(f'Request denied, {r.status_code}, blocking {to_block} for {delay} seconds')
BLOCKHANDLER.replace_provider_entry(to_block, delay, msg)
else:
logger.debug(f"Error {r.status_code} url={url}")
elif 'googleapis' in url or 'google.com/search?q=ISBN' in url:
prov = 'googleapis' if 'googleapis' in url else 'ISBN'
if 'Limit Exceeded' in msg:
            # how long until midnight Pacific Time, when Google resets the quotas
delay = seconds_to_midnight() + 28800 # PT is 8hrs behind UTC
if delay > 86400:
delay -= 86400 # no roll-over to next day
elif r.status_code == 429: # too many requests
delay = 60
else:
# might be forbidden for a different reason where midnight might not matter
# eg "Cannot determine user location for geographically restricted operation"
delay = 3600
logger.debug(f'Request denied, {r.status_code}, blocking {prov} for {delay} seconds: {msg}')
BLOCKHANDLER.replace_provider_entry(prov, delay, msg)
else:
logger.debug(f"Error {r.status_code} url={url}")
msg = responses.get(r.status_code, r.text)
return f"Response status {r.status_code}: {msg}", False
def cache_img(img_type: ImageType, img_id: str, img_url: str, refresh=False) -> (str, bool, bool):
""" Cache the image from the given filename or URL in the local images cache
linked to the id.
On success, return the link to the cached file, True, was_in_cache
On error, return message, False, False """
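    # Illustrative usage (book_id and cover_url are caller-supplied values):
    #   link, ok, was_cached = cache_img(ImageType.BOOK, book_id, cover_url)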
logger = logging.getLogger(__name__)
logging.getLogger('urllib3.connectionpool').setLevel(logging.CRITICAL)
had_cache = False
cachefile = DIRS.get_cachefile(img_type.value, f"{img_id}.jpg")
link = f'cache/{img_type.value}/{img_id}.jpg'
if path_isfile(cachefile):
        if not refresh:  # use the existing cached image unless a refresh was requested
cachelogger = logging.getLogger('special.cache')
cachelogger.debug(f"Cached {img_type.name} image exists {cachefile}")
return link, True, True
had_cache = True
if img_url.startswith('http'):
result, success = fetch_url(img_url, raw=True)
if success:
try:
with open(syspath(cachefile), 'wb') as img:
img.write(result)
return link, True, False
except Exception as e:
logger.error(f"{type(e).__name__} writing image to {cachefile}, {str(e)}")
logger.error(f"Image url: {img_url}")
return str(e), False, False
return result, False, False
if not path_isfile(img_url) and img_url.endswith('.jpg'):
# icrawler might give us jpg or png
img_url = f"{img_url[:-4]}.png"
if path_isfile(img_url):
try:
shutil.copyfile(img_url, cachefile)
return link, True, had_cache
except Exception as e:
logger.error(f"{type(e).__name__} copying image to {cachefile}, {str(e)}")
return str(e), False, False
msg = f"No file [{img_url}]"
logger.debug(msg)
return msg, False, False
def gr_xml_request(my_url, use_cache=True, expire=True) -> (Any, bool):
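    """ Fetch my_url through the XML cache (the GoodReads rate limit is applied on a cache miss).
    Return (parsed ElementTree root or None, True if served from cache) """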
# respect goodreads api limit
result, in_cache = XMLCacheRequest(url=my_url, use_cache=use_cache, expire=expire).get_cached_request()
return result, in_cache
def json_request(my_url, use_cache=True, expire=True) -> (Any, bool):
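    """ Fetch my_url through the JSON cache.
    Return (decoded JSON or None, True if served from cache) """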
result, in_cache = JSONCacheRequest(url=my_url, use_cache=use_cache, expire=expire).get_cached_request()
return result, in_cache
def html_request(my_url, use_cache=True, expire=True) -> (Any, bool):
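    """ Fetch my_url through the HTML cache.
    Return (page content as bytes or None, True if served from cache) """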
result, in_cache = HTMLCacheRequest(url=my_url, use_cache=use_cache, expire=expire).get_cached_request()
return result, in_cache
class CacheRequest(ABC):
""" Handle cache requests for LazyLibrarian. Use a concrete subclass of this """
def __init__(self, url: str, use_cache: bool, expire: bool):
self.url = url
self.use_cache = use_cache
self.expire = expire
self.logger = logging.getLogger()
self.cachelogger = logging.getLogger('special.cache')
@classmethod
@abc.abstractmethod
def name(cls) -> str:
""" Return the name of the cache, such as XML, HTML or JSON """
@classmethod
def cachedir_name(cls) -> str:
return f"{cls.name()}Cache"
@abc.abstractmethod
def read_from_cache(self, hashfilename: str) -> (str, bool):
""" Read the source from cache """
def fetch_data(self) -> (str, bool):
""" Fetch the data; called if it's not in the cache """
return fetch_url(self.url, headers=None)
@abc.abstractmethod
def load_from_result_and_cache(self, result: str, filename: str, docache: bool) -> (str, bool):
""" Load the value from result and store it in cache if docache is True """
def get_cached_request(self) -> (Any, bool):
# hashfilename = hash of url
# if hashfilename exists in cache and isn't too old, return its contents
# if not, read url and store the result in the cache
# return the result, and boolean True if source was cache
cache_location = DIRS.get_cachedir(self.cachedir_name())
hashfilename, myhash = self.get_hashed_filename(cache_location)
        # CACHE_AGE is in days, so convert it to seconds
expire_older_than = CONFIG.get_int('CACHE_AGE') * 24 * 60 * 60 if self.expire else 0
valid_cache = self.is_in_cache(expire_older_than, hashfilename, myhash)
if valid_cache:
lazylibrarian.CACHE_HIT += 1
self.cachelogger.debug(f"CacheHandler: Returning CACHED response {hashfilename} for {self.url}")
source, ok = self.read_from_cache(hashfilename)
if not ok:
self.logger.debug(f"CacheHandler: Failed to read {hashfilename} for {self.url}")
return None, False
else:
lazylibrarian.CACHE_MISS += 1
for blk in service_blocked:
if blk in self.url and BLOCKHANDLER.is_blocked(blk):
return None, False
result, success = self.fetch_data()
if success:
self.cachelogger.debug(f"CacheHandler: Storing {self.name()} {myhash} for {self.url}")
                source, _ = self.load_from_result_and_cache(result, hashfilename, expire_older_than)
elif '404' in result: # don't block on "not found"
return None, False
else:
msg = f"Got error response for {self.url}: {result.split('<')[0]}"
self.logger.debug(msg)
to_block = ''
for blk in service_blocked:
if blk in self.url:
to_block = blk
break
if to_block:
delay = 30
self.logger.debug(f'Blocking {to_block} for {delay} seconds')
BLOCKHANDLER.replace_provider_entry(to_block, delay, msg)
return None, False
return source, valid_cache
def is_in_cache(self, expiry: int, hashfilename: str, myhash: str) -> bool:
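        """ Return True if a usable cached copy exists for this request.
        Delete the cache entry and return False if it has expired """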
if self.use_cache and path_isfile(hashfilename):
cache_modified_time = os.stat(hashfilename).st_mtime
time_now = time.time()
if self.expire and cache_modified_time < time_now - expiry:
# Cache entry is too old, delete it
cachelogger = logging.getLogger('special.cache')
cachelogger.debug(f"Expiring {myhash}")
os.remove(syspath(hashfilename))
return False
return True
return False
def get_hashed_filename(self, cache_location: str) -> (str, str):
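        """ Return (cache file path, md5 hash of the url).
        Cache files fan out over two hex levels, e.g. <cache>/a/3/a3f2....xml for an illustrative hash """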
myhash = md5_utf8(self.url)
hashfilename = os.path.join(cache_location, myhash[0], myhash[1], f"{myhash}.{self.name().lower()}")
return hashfilename, myhash
class XMLCacheRequest(CacheRequest):
@classmethod
def name(cls) -> str:
return "XML"
def read_from_cache(self, hashfilename: str) -> (str, bool):
with open(syspath(hashfilename), "rb") as cachefile:
result = cachefile.read()
source = None
if result and result.startswith(b'<?xml'):
try:
source = ElementTree.fromstring(result)
except UnicodeEncodeError:
# seems sometimes the page contains utf-16 but the header says it's utf-8
try:
result = result.decode('utf-16').encode('utf-8')
source = ElementTree.fromstring(result)
except (ElementTree.ParseError, UnicodeEncodeError, UnicodeDecodeError):
self.logger.error(f"Error parsing xml from {hashfilename}")
source = None
except ElementTree.ParseError:
self.logger.error(f"Error parsing xml from {hashfilename}")
source = None
if source is None:
self.logger.error(f"Error reading xml from {hashfilename}")
            # normally delete bad data, but keep it for inspection if cache debug logging is enabled
if not self.cachelogger.isEnabledFor(logging.DEBUG):
remove_file(hashfilename)
return None, False
return source, True
def fetch_data(self) -> (str, bool):
gr_api_sleep()
return fetch_url(self.url, raw=True, headers=None)
def load_from_result_and_cache(self, result: str, filename: str, docache: bool) -> (str, bool):
source = None
result = make_bytestr(result)
if result and result.startswith(b'<?xml'):
try:
source = ElementTree.fromstring(result)
if not docache:
self.cachelogger.debug(f"Returning {len(source)} bytes xml uncached")
return source, False
except UnicodeEncodeError:
# sometimes we get utf-16 data labelled as utf-8
try:
result = result.decode('utf-16').encode('utf-8')
source = ElementTree.fromstring(result)
if not docache:
self.cachelogger.debug(f"Returning {len(source)} bytes xml uncached")
return source, False
except (ElementTree.ParseError, UnicodeEncodeError, UnicodeDecodeError):
self.logger.error(f"Error parsing xml from {self.url}")
source = None
except ElementTree.ParseError:
self.logger.error(f"Error parsing xml from {self.url}")
source = None
if source is not None:
with open(syspath(filename), "wb") as cachefile:
cachefile.write(result)
self.cachelogger.debug(f"Cached {len(source)} bytes xml {filename}")
else:
self.logger.error(f"Error getting xml data from {self.url}")
if result:
self.logger.error(f"Result: {result[:80]}")
with open(syspath(f"{filename}.err"), "wb") as cachefile:
cachefile.write(result)
self.logger.error(f"Cached {len(result)} bytes {filename}.err")
return None, False
return source, True
class HTMLCacheRequest(CacheRequest):
@classmethod
def name(cls) -> str:
return "HTML"
def read_from_cache(self, hashfilename: str) -> (str, bool):
with open(syspath(hashfilename), "rb") as cachefile:
source = cachefile.read()
return source, True
def load_from_result_and_cache(self, result: str, filename, docache) -> (str, bool):
source = make_bytestr(result)
with open(syspath(filename), "wb") as cachefile:
cachefile.write(source)
return source, True
class JSONCacheRequest(CacheRequest):
@classmethod
def name(cls) -> str:
return "JSON"
def read_from_cache(self, hashfilename: str) -> (str, bool):
        try:
            # the with-statement closes the file itself; a separate close() is not needed
            with open(hashfilename) as f:
                source = json.load(f)
except ValueError:
self.logger.error(f"Error decoding json from {hashfilename}")
# normally delete bad data, but keep for inspection if debug logging cache
# if not self.cachelogger.isEnabledFor(logging.DEBUG):
remove_file(hashfilename)
return None, False
return source, True
def load_from_result_and_cache(self, result: str, filename: str, docache) -> (str, bool):
try:
source = json.loads(result)
if not docache:
return source, False
except Exception as e:
self.logger.error(f"{type(e).__name__} decoding json from {self.url}")
self.logger.debug(f"{e} : {result}")
return None, False
with open(filename, "w") as outfile:
json.dump(source, outfile)
return source, True
def clean_cache():
""" Remove unused files from the cache - delete if expired or unused.
Check JSONCache WorkCache XMLCache SeriesCache Author Book Magazine Comic IRC
Check covers and authorimages etc. referenced in the database exist
and change database entry if missing, expire old pastissues table entries """
threadname = thread_name()
if "Thread" in threadname:
thread_name("CLEANCACHE")
logger = logging.getLogger(__name__)
db = database.DBConnection()
result = []
try:
db.upsert("jobs", {'Start': time.time()}, {'Name': 'CLEANCACHE'})
result = [
# Remove files that are too old from cache directories
FileExpirer("IRCCache", False, check_int(lazylibrarian.IRC_CACHE_EXPIRY, 0)).clean(),
FileExpirer("JSONCache", True, CONFIG.get_int('CACHE_AGE') * 24 * 60 * 60).clean(),
FileExpirer("XMLCache", True, CONFIG.get_int('CACHE_AGE') * 24 * 60 * 60).clean(),
# Remove files not referenced by relevant item in the DB
OrphanCleaner("WorkCache", True, db, 'BookID', 'books', '%s', True).clean(),
OrphanCleaner("SeriesCache", False, db, 'SeriesID', 'series', '%s', True).clean(),
OrphanCleaner("magazine", False, db, 'cover', 'issues', 'cache/magazine/%s', False).clean(),
# Remove files no longer referenced by the database
UnreferencedCleaner("author", "Author cache", db, 'AuthorImg', 13,
"authors where instr(AuthorImg, 'cache/author/') = 1").clean(),
UnreferencedCleaner("book", "Book cache", db, 'BookImg', 11,
"books where instr(BookImg, 'cache/book/') = 1").clean(),
# At this point there should be no more .jpg files in the root of the cachedir
            # Any that are still there are for books/authors deleted from the database
ExtensionCleaner("root", ".jpg").clean(),
# Verify the cover images referenced in the database are present, replace if not
DBCleaner("book", "Cover", db, "books", "BookImg", "BookName", "BookID", 'images/nocover.png').clean(),
DBCleaner("author", "Image", db, "authors", "AuthorImg", "AuthorName", "AuthorID",
'images/nophoto.png').clean(),
]
expiry = CONFIG.get_int('CACHE_AGE')
if expiry:
time_now = time.time()
too_old = time_now - (expiry * 24 * 60 * 60)
# delete any pastissues table entries that are too old
count = db.match('SELECT COUNT(*) as counter from pastissues')
if count:
total = count['counter']
else:
total = 0
count = db.match("SELECT COUNT(*) as counter from pastissues WHERE Added>0 and Added<?", (too_old,))
if count:
old = count['counter']
else:
old = 0
db.action("DELETE from pastissues WHERE Added>0 and Added<?", (too_old,))
msg = f"Cleaned {old} old pastissues, kept {total - old}"
result.append(msg)
logger.debug(msg)
except Exception as e:
logger.error(str(e))
db.upsert("jobs", {'Finish': time.time()}, {'Name': 'CLEANCACHE'})
db.close()
thread_name(threadname)
return result
class CacheCleaner(ABC):
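    """ Base class for the cache cleaning tasks; each subclass returns a one-line summary from clean() """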
def __init__(self, basedir: str):
self.logger: logging.Logger = logging.getLogger()
self.basedir: str = basedir
self.cache: str = os.path.join(DIRS.CACHEDIR, basedir)
self.cleaned: int = 0
self.kept: int = 0
@abc.abstractmethod
def clean(self) -> str: pass
class FileCleaner(CacheCleaner):
""" Cleaners that delete files from cache """
def __init__(self, basedir: str, hexdirs: bool):
super().__init__(basedir)
self.hexdirs: bool = hexdirs
def clean(self) -> str:
""" Generic cleaning routine, iterates directories """
if path_isdir(self.cache):
if self.hexdirs:
subdirs = itertools.product("0123456789abcdef", repeat=2)
for i, j in subdirs:
dirname = os.path.join(self.cache, i, j)
for cached_file in listdir(dirname):
self.clean_file(os.path.join(dirname, cached_file))
else:
for cached_file in listdir(self.cache):
self.clean_file(os.path.join(self.cache, cached_file))
msg = (f"Cleaned {self.cleaned} {self.name()} {plural(self.cleaned, 'file')} from {self.source()}, "
f"kept {self.kept}")
self.logger.debug(msg)
return msg
def remove_if(self, filename, condition):
if condition:
remove_file(filename)
self.cleaned += 1
else:
self.kept += 1
def source(self) -> str:
return self.basedir
@abc.abstractmethod
def name(self) -> str:
pass
@abc.abstractmethod
def clean_file(self, filename):
pass
class FileExpirer(FileCleaner):
""" Delete files in dirname that are older than expiry_seconds old.
Return a string with a summary for printing. """
def __init__(self, basedir: str, hexdirs: bool, expiry_sec: int):
super().__init__(basedir, hexdirs)
self.time_now = time.time()
self.expiry_sec = expiry_sec
def name(self) -> str:
return 'expired'
def clean_file(self, filename):
if path_isfile(filename):
cache_modified_time = os.stat(filename).st_mtime
self.remove_if(filename, cache_modified_time < self.time_now - self.expiry_sec)
class ExtensionCleaner(FileCleaner):
""" Delete all files in basedir with the right extension """
def __init__(self, basedir: str, ext: str):
super().__init__(basedir, False)
self.ext = ext
def name(self):
return 'superfluous'
def clean_file(self, filename):
if self.ext:
self.remove_if(filename, filename.endswith(self.ext))
class OrphanCleaner(FileCleaner):
""" Delete files in dirname that don't have a corresponding entry in the DB,
where the ID is the filename without the extension.
Return a string with a summary for printing. """
def __init__(self, basedir: str, hexdirs: bool, db, field: str, table: str, matcher: str, dotsplit: bool):
super().__init__(basedir, hexdirs)
self.db = db
self.field = field
self.table = table
self.matcher = matcher
self.dotsplit = dotsplit
def name(self) -> str:
return 'orphan'
def getid(self, filename) -> str:
name = os.path.basename(filename)
if self.dotsplit:
return name.split('.')[0]
# Magazines use a different encoding scheme in the file name.
fname, extn = splitext(name)
return fname.split('_')[0] + extn
def clean_file(self, filename):
try:
dbid = self.getid(filename)
query = f'select {self.field} from {self.table} where {self.field}=?'
match = self.matcher % dbid
item = self.db.match(query, (match,))
self.remove_if(filename, not item)
except IndexError:
self.logger.error(f'Clean Cache: Error splitting {filename}')
class UnreferencedCleaner(FileCleaner):
""" Delete files that are no longer referenced in the database """
def __init__(self, basedir: str, taskname: str, db, field: str, fieldcut: int, query: str):
super().__init__(basedir, False)
self.taskname = taskname
self.db = db
fullquery = f'SELECT {field} from {query}'
res = db.select(fullquery)
self.items = [item[field][fieldcut:] for item in res]
self.logger.debug(f"Checking {len(self.items)} {field} images")
def name(self) -> str:
return 'orphan'
def source(self) -> str:
return self.taskname
def clean_file(self, filename):
name = os.path.basename(filename)
self.remove_if(filename, name not in self.items)
class DBCleaner(CacheCleaner):
""" Where the database refers to an image that no longer exists, replace it
with the fallback image """
def __init__(self, basedir: str, typestr: str, db, table: str, fimg: str, fname: str, fid: str, fallback: str):
super().__init__(basedir)
self.typestr = typestr
self.db = db
self.table = table
self.fimg = fimg
self.fname = fname
self.fid = fid
self.fallback = fallback
query = f'SELECT {fimg},{fname},{fid} from {table}'
self.items = db.select(query)
def clean(self) -> str:
for item in self.items:
keep = True
imgfile = ''
if item[self.fimg] is None or item[self.fimg] == '':
keep = False
if keep and not item[self.fimg].startswith('http') and item[self.fimg] != self.fallback:
# html uses '/' as separator, but os might not
imgname = item[self.fimg].rsplit('/')[-1]
imgfile = os.path.join(self.cache, imgname)
if not path_isfile(imgfile):
keep = False
if keep:
self.kept += 1
else:
self.cleaned += 1
self.logger.debug(f'{self.typestr} missing for {item[self.fname]} {imgfile}')
updatequery = f"update {self.table} set {self.fimg}='{self.fallback}' where {self.fid}=?"
self.db.action(updatequery, (item[self.fid],))
msg = f"Cleaned {self.cleaned} missing {plural(self.cleaned, self.typestr)}, kept {self.kept}"
self.logger.debug(msg)
return msg