Added bing image search option

This commit is contained in:
Phil Borman 2022-09-10 01:01:05 +02:00
parent 5c198f631b
commit d5dbba7e28
6 changed files with 75 additions and 66 deletions

View File

@ -117,7 +117,7 @@ class BaiduParser(Parser):
except:
self.logger.error('Fail to parse the response in json format')
return
for item in content['data']:
for item in content.get('data', ''):
if 'objURL' in item:
img_url = self._decode_url(item['objURL'])
elif 'hoverURL' in item:

View File

@ -128,15 +128,12 @@ class BingParser(Parser):
soup = BeautifulSoup(
response.content.decode('utf-8', 'ignore'), 'html5lib')
image_divs = soup.find_all('div', class_='imgpt')
pattern = re.compile(r'murl\":\"(.*?)\.jpg')
for div in image_divs:
href_str = html_parser.HTMLParser().unescape(div.a['m'])
match = pattern.search(href_str)
if match:
name = (match.group(1)
if PY3 else match.group(1).encode('utf-8'))
img_url = '{}.jpg'.format(name)
yield dict(file_url=img_url)
try:
img_url = str(div).rsplit('"murl":"')[1].split('"')[0]
except IndexError:
continue
yield dict(file_url=img_url)
class BingImageCrawler(Crawler):

View File

@ -151,15 +151,11 @@ class GoogleParser(Parser):
def parse(self, response):
soup = BeautifulSoup(
response.content.decode('utf-8', 'ignore'), 'html5lib')
data = str(soup).split('img alt=')
if len(data) < 2:
return []
images = soup.find_all(name='img')
uris = []
for item in data[1:]:
try:
uris.append(item.split('src="')[1].split('"')[0])
except IndexError:
pass
for img in images:
if img.has_attr('src'):
uris.append(img['src'])
return [{'file_url': uri} for uri in uris]

View File

@ -135,10 +135,7 @@ class Crawler(object):
"""
if headers is None:
headers = {
'User-Agent':
('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3)'
' AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/48.0.2564.116 Safari/537.36')
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
elif not isinstance(headers, dict):
raise TypeError('"headers" must be a dict object')

View File

@ -41,10 +41,13 @@ try:
import PIL
# noinspection PyUnresolvedReferences
from PIL import Image as PILImage
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler, FlickrImageCrawler
except ImportError:
PIL = None
GoogleImageCrawler = None
BingImageCrawler = None
BaiduImageCrawler = None
FlickrImageCrawler = None
try:
# noinspection PyProtectedMember
@ -542,48 +545,15 @@ def get_book_cover(bookid=None, src=None):
if src:
return None, src
if src == 'googleimage' or not src and lazylibrarian.CONFIG['IMP_GOOGLEIMAGE']:
if PIL and safeparams:
icrawlerdir = os.path.join(cachedir, 'icrawler', bookid)
gc = GoogleImageCrawler(storage={'root_dir': icrawlerdir})
logger.debug(safeparams)
logger.debug(icrawlerdir)
gc.crawl(keyword=safeparams, max_num=1)
if os.path.exists(icrawlerdir):
res = len(os.listdir(icrawlerdir))
else:
res = 0
logger.debug("Found %d %s" % (res, plural(res, 'image')))
if res:
img = os.path.join(icrawlerdir, os.listdir(icrawlerdir)[0])
if src:
coverlink, success, _ = cache_img("book", bookid + '_gb', img)
else:
coverlink, success, _ = cache_img("book", bookid, img, refresh=True)
data = ''
coverfile = os.path.join(lazylibrarian.DATADIR, coverlink)
if path_isfile(coverfile):
with open(syspath(coverfile), 'rb') as f:
data = f.read()
if len(data) < 50:
logger.debug('Got an empty google search image for %s [%s]' % (bookid, coverlink))
elif success:
logger.debug("Cached google search cover for %s %s" %
(item['AuthorName'], item['BookName']))
rmtree(icrawlerdir, ignore_errors=True)
return coverlink, 'google image'
else:
logger.debug("Error getting google image %s, [%s]" % (img, coverlink))
else:
logger.debug("No images found in google page for %s" % bookid)
# rmtree(icrawlerdir, ignore_errors=True)
else:
if not PIL:
logger.debug("PIL not found for google image search for %s" % bookid)
else:
logger.debug("No parameters for google image search for %s" % bookid)
if src:
return None, src
if PIL and safeparams:
if src == 'baidu' or not src:
return crawl_image('baidu', src, cachedir, item, bookid, safeparams)
if src == 'bing' or not src:
return crawl_image('bing', src, cachedir, item, bookid, safeparams)
if src == 'flickr' or not src:
return crawl_image('flickr', src, cachedir, item, bookid, safeparams)
if src == 'googleimage' or not src and lazylibrarian.CONFIG['IMP_GOOGLEIMAGE']:
return crawl_image('google', src, cachedir, item, bookid, safeparams)
logger.debug("No image found from any configured source")
return None, src
@ -592,6 +562,49 @@ def get_book_cover(bookid=None, src=None):
return None, src
def crawl_image(crawler_name, src, cachedir, item, bookid, safeparams):
    """Fetch a book cover image from the named search engine and cache it.

    crawler_name selects the icrawler backend ('baidu', 'bing', 'flickr';
    anything else falls back to google). Downloads at most one image into a
    per-book icrawler working directory, caches it via cache_img, and returns
    (coverlink, '<name> image') on success or (None, src) when nothing usable
    was found.
    """
    icrawlerdir = os.path.join(cachedir, 'icrawler', bookid)

    # Dispatch table for the crawler backends; unknown names default to google.
    backends = {
        'baidu': BaiduImageCrawler,
        'bing': BingImageCrawler,
        'flickr': FlickrImageCrawler,
    }
    crawler_cls = backends.get(crawler_name, GoogleImageCrawler)
    crawler = crawler_cls(storage={'root_dir': icrawlerdir})
    crawler.crawl(keyword=safeparams, max_num=1)

    downloaded = os.listdir(icrawlerdir) if os.path.exists(icrawlerdir) else []
    res = len(downloaded)
    logger.debug("Found %d %s" % (res, plural(res, 'image')))

    if not res:
        logger.debug("No images found in %s page for %s" % (crawler_name, bookid))
        # rmtree(icrawlerdir, ignore_errors=True)
        return None, src

    img = os.path.join(icrawlerdir, downloaded[0])
    # A specific source request gets a source-tagged cache name; a generic
    # request refreshes the book's default cover entry.
    if src:
        coverlink, success, _ = cache_img("book", bookid + '_' + crawler_name[:2], img)
    else:
        coverlink, success, _ = cache_img("book", bookid, img, refresh=True)

    data = ''
    coverfile = os.path.join(lazylibrarian.DATADIR, coverlink)
    if path_isfile(coverfile):
        with open(syspath(coverfile), 'rb') as f:
            data = f.read()

    # Anything under 50 bytes is treated as a junk/placeholder image.
    if len(data) < 50:
        logger.debug('Got an empty %s search image for %s [%s]' % (crawler_name, bookid, coverlink))
    elif success:
        logger.debug("Cached %s search cover for %s %s" %
                     (crawler_name, item['AuthorName'], item['BookName']))
        rmtree(icrawlerdir, ignore_errors=True)
        return coverlink, '%s image' % crawler_name
    else:
        logger.debug("Error getting %s image %s, [%s]" % (crawler_name, img, coverlink))
    return None, src
def get_author_image(authorid=None, refresh=False, max_num=1):
if not authorid:
logger.error("get_author_image: No authorid")
@ -619,7 +632,13 @@ def get_author_image(authorid=None, refresh=False, max_num=1):
if os.path.exists(icrawlerdir):
res = len(os.listdir(icrawlerdir))
else:
res = 0
# nothing from google, try bing
bc = BingImageCrawler(storage={'root_dir': icrawlerdir})
bc.crawl(keyword=safeparams, max_num=int(max_num))
if os.path.exists(icrawlerdir):
res = len(os.listdir(icrawlerdir))
else:
res = 0
logger.debug("Found %d %s" % (res, plural(res, 'image')))
if max_num == 1:
if res:

View File

@ -3471,7 +3471,7 @@ class WebInterface(object):
bookdata.pop('Narrator', None)
covers = []
sources = ['current', 'cover', 'goodreads', 'librarything', 'openlibrary',
'googleisbn', 'googleimage']
'googleisbn', 'bing', 'googleimage'] # flickr needs an apikey, baidu doesn't like bots
if NEW_WHATWORK:
sources.append('whatwork')
for source in sources: