Added bing image search option

This commit is contained in:
Phil Borman 2022-09-10 01:01:05 +02:00
parent 5c198f631b
commit d5dbba7e28
6 changed files with 75 additions and 66 deletions

View File

@ -117,7 +117,7 @@ class BaiduParser(Parser):
except:
self.logger.error('Fail to parse the response in json format')
return
for item in content['data']:
for item in content.get('data', ''):
if 'objURL' in item:
img_url = self._decode_url(item['objURL'])
elif 'hoverURL' in item:

View File

@ -128,15 +128,12 @@ class BingParser(Parser):
soup = BeautifulSoup(
response.content.decode('utf-8', 'ignore'), 'html5lib')
image_divs = soup.find_all('div', class_='imgpt')
pattern = re.compile(r'murl\":\"(.*?)\.jpg')
for div in image_divs:
href_str = html_parser.HTMLParser().unescape(div.a['m'])
match = pattern.search(href_str)
if match:
name = (match.group(1)
if PY3 else match.group(1).encode('utf-8'))
img_url = '{}.jpg'.format(name)
yield dict(file_url=img_url)
try:
img_url = str(div).rsplit('"murl":"')[1].split('"')[0]
except IndexError:
continue
yield dict(file_url=img_url)
class BingImageCrawler(Crawler):

View File

@ -151,15 +151,11 @@ class GoogleParser(Parser):
def parse(self, response):
soup = BeautifulSoup(
response.content.decode('utf-8', 'ignore'), 'html5lib')
data = str(soup).split('img alt=')
if len(data) < 2:
return []
images = soup.find_all(name='img')
uris = []
for item in data[1:]:
try:
uris.append(item.split('src="')[1].split('"')[0])
except IndexError:
pass
for img in images:
if img.has_attr('src'):
uris.append(img['src'])
return [{'file_url': uri} for uri in uris]

View File

@ -135,10 +135,7 @@ class Crawler(object):
"""
if headers is None:
headers = {
'User-Agent':
('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3)'
' AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/48.0.2564.116 Safari/537.36')
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
elif not isinstance(headers, dict):
raise TypeError('"headers" must be a dict object')

View File

@ -41,10 +41,13 @@ try:
import PIL
# noinspection PyUnresolvedReferences
from PIL import Image as PILImage
from icrawler.builtin import GoogleImageCrawler
from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler, FlickrImageCrawler
except ImportError:
PIL = None
GoogleImageCrawler = None
BingImageCrawler = None
BaiduImageCrawler = None
FlickrImageCrawler = None
try:
# noinspection PyProtectedMember
@ -542,48 +545,15 @@ def get_book_cover(bookid=None, src=None):
if src:
return None, src
if src == 'googleimage' or not src and lazylibrarian.CONFIG['IMP_GOOGLEIMAGE']:
if PIL and safeparams:
icrawlerdir = os.path.join(cachedir, 'icrawler', bookid)
gc = GoogleImageCrawler(storage={'root_dir': icrawlerdir})
logger.debug(safeparams)
logger.debug(icrawlerdir)
gc.crawl(keyword=safeparams, max_num=1)
if os.path.exists(icrawlerdir):
res = len(os.listdir(icrawlerdir))
else:
res = 0
logger.debug("Found %d %s" % (res, plural(res, 'image')))
if res:
img = os.path.join(icrawlerdir, os.listdir(icrawlerdir)[0])
if src:
coverlink, success, _ = cache_img("book", bookid + '_gb', img)
else:
coverlink, success, _ = cache_img("book", bookid, img, refresh=True)
data = ''
coverfile = os.path.join(lazylibrarian.DATADIR, coverlink)
if path_isfile(coverfile):
with open(syspath(coverfile), 'rb') as f:
data = f.read()
if len(data) < 50:
logger.debug('Got an empty google search image for %s [%s]' % (bookid, coverlink))
elif success:
logger.debug("Cached google search cover for %s %s" %
(item['AuthorName'], item['BookName']))
rmtree(icrawlerdir, ignore_errors=True)
return coverlink, 'google image'
else:
logger.debug("Error getting google image %s, [%s]" % (img, coverlink))
else:
logger.debug("No images found in google page for %s" % bookid)
# rmtree(icrawlerdir, ignore_errors=True)
else:
if not PIL:
logger.debug("PIL not found for google image search for %s" % bookid)
else:
logger.debug("No parameters for google image search for %s" % bookid)
if src:
return None, src
if PIL and safeparams:
if src == 'baidu' or not src:
return crawl_image('baidu', src, cachedir, item, bookid, safeparams)
if src == 'bing' or not src:
return crawl_image('bing', src, cachedir, item, bookid, safeparams)
if src == 'flickr' or not src:
return crawl_image('flickr', src, cachedir, item, bookid, safeparams)
if src == 'googleimage' or not src and lazylibrarian.CONFIG['IMP_GOOGLEIMAGE']:
return crawl_image('google', src, cachedir, item, bookid, safeparams)
logger.debug("No image found from any configured source")
return None, src
@ -592,6 +562,49 @@ def get_book_cover(bookid=None, src=None):
return None, src
def crawl_image(crawler_name, src, cachedir, item, bookid, safeparams):
    """Fetch a book cover image from the named search engine and cache it.

    crawler_name selects the icrawler backend ('baidu', 'bing', 'flickr';
    anything else falls back to google). Downloads at most one image into a
    per-book icrawler working directory, caches it via cache_img, and returns
    (coverlink, '<name> image') on success or (None, src) when nothing usable
    was found.
    """
    icrawlerdir = os.path.join(cachedir, 'icrawler', bookid)

    # Dispatch table for the crawler backends; unknown names default to google.
    backends = {
        'baidu': BaiduImageCrawler,
        'bing': BingImageCrawler,
        'flickr': FlickrImageCrawler,
    }
    crawler_cls = backends.get(crawler_name, GoogleImageCrawler)
    crawler = crawler_cls(storage={'root_dir': icrawlerdir})
    crawler.crawl(keyword=safeparams, max_num=1)

    downloaded = os.listdir(icrawlerdir) if os.path.exists(icrawlerdir) else []
    res = len(downloaded)
    logger.debug("Found %d %s" % (res, plural(res, 'image')))

    if not res:
        logger.debug("No images found in %s page for %s" % (crawler_name, bookid))
        # rmtree(icrawlerdir, ignore_errors=True)
        return None, src

    img = os.path.join(icrawlerdir, downloaded[0])
    # A specific source request gets a source-tagged cache name; a generic
    # request refreshes the book's default cover entry.
    if src:
        coverlink, success, _ = cache_img("book", bookid + '_' + crawler_name[:2], img)
    else:
        coverlink, success, _ = cache_img("book", bookid, img, refresh=True)

    data = ''
    coverfile = os.path.join(lazylibrarian.DATADIR, coverlink)
    if path_isfile(coverfile):
        with open(syspath(coverfile), 'rb') as f:
            data = f.read()

    # Anything under 50 bytes is treated as a junk/placeholder image.
    if len(data) < 50:
        logger.debug('Got an empty %s search image for %s [%s]' % (crawler_name, bookid, coverlink))
    elif success:
        logger.debug("Cached %s search cover for %s %s" %
                     (crawler_name, item['AuthorName'], item['BookName']))
        rmtree(icrawlerdir, ignore_errors=True)
        return coverlink, '%s image' % crawler_name
    else:
        logger.debug("Error getting %s image %s, [%s]" % (crawler_name, img, coverlink))
    return None, src
def get_author_image(authorid=None, refresh=False, max_num=1):
if not authorid:
logger.error("get_author_image: No authorid")
@ -619,7 +632,13 @@ def get_author_image(authorid=None, refresh=False, max_num=1):
if os.path.exists(icrawlerdir):
res = len(os.listdir(icrawlerdir))
else:
res = 0
# nothing from google, try bing
bc = BingImageCrawler(storage={'root_dir': icrawlerdir})
bc.crawl(keyword=safeparams, max_num=int(max_num))
if os.path.exists(icrawlerdir):
res = len(os.listdir(icrawlerdir))
else:
res = 0
logger.debug("Found %d %s" % (res, plural(res, 'image')))
if max_num == 1:
if res:

View File

@ -3471,7 +3471,7 @@ class WebInterface(object):
bookdata.pop('Narrator', None)
covers = []
sources = ['current', 'cover', 'goodreads', 'librarything', 'openlibrary',
'googleisbn', 'googleimage']
'googleisbn', 'bing', 'googleimage'] # flickr needs an apikey, baidu doesn't like bots
if NEW_WHATWORK:
sources.append('whatwork')
for source in sources: