mirror of https://gitlab.com/LazyLibrarian/LazyLibrarian.git
synced 2026-02-06 10:47:15 +00:00

Added bing image search option

parent 5c198f631b
commit d5dbba7e28
@@ -117,7 +117,7 @@ class BaiduParser(Parser):
         except:
             self.logger.error('Fail to parse the response in json format')
             return
-        for item in content['data']:
+        for item in content.get('data', ''):
             if 'objURL' in item:
                 img_url = self._decode_url(item['objURL'])
             elif 'hoverURL' in item:
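The one-line change swaps a hard key lookup for a defensive .get(): when Baidu returns an error payload with no 'data' key, the parser now yields nothing instead of raising KeyError. A standalone sketch of the difference (hypothetical payloads, not code from the commit):

    # Hypothetical Baidu responses: one normal, one error payload without 'data'
    content_ok = {'data': [{'objURL': 'encoded-url-1'}]}
    content_err = {'errno': 1}

    for item in content_ok.get('data', ''):
        print(item)                      # processed as before

    for item in content_err.get('data', ''):
        print(item)                      # default '' is empty, loop body never runs

    # content_err['data'] would instead raise KeyError and abort parsing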
@@ -128,15 +128,12 @@ class BingParser(Parser):
         soup = BeautifulSoup(
             response.content.decode('utf-8', 'ignore'), 'html5lib')
         image_divs = soup.find_all('div', class_='imgpt')
-        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
         for div in image_divs:
-            href_str = html_parser.HTMLParser().unescape(div.a['m'])
-            match = pattern.search(href_str)
-            if match:
-                name = (match.group(1)
-                        if PY3 else match.group(1).encode('utf-8'))
-                img_url = '{}.jpg'.format(name)
-                yield dict(file_url=img_url)
+            try:
+                img_url = str(div).rsplit('"murl":"')[1].split('"')[0]
+            except IndexError:
+                continue
+            yield dict(file_url=img_url)


 class BingImageCrawler(Crawler):
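The replacement drops the html_parser/regex machinery and slices the serialized div directly, which also stops limiting results to .jpg URLs. A quick sketch of the string surgery on a made-up imgpt div (the attribute layout is illustrative, not captured from Bing):

    # str(div) for a Bing result renders the 'm' attribute's JSON inline:
    div_text = ('<div class="imgpt"><a m=\'{"murl":"https://example.com/cover.png",'
                '"turl":"https://example.com/thumb.png"}\'>...</a></div>')

    try:
        img_url = div_text.rsplit('"murl":"')[1].split('"')[0]
    except IndexError:       # a div without a murl entry is skipped
        img_url = None

    print(img_url)           # https://example.com/cover.png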
@@ -151,15 +151,11 @@ class GoogleParser(Parser):
     def parse(self, response):
         soup = BeautifulSoup(
             response.content.decode('utf-8', 'ignore'), 'html5lib')
-        data = str(soup).split('img alt=')
-        if len(data) < 2:
-            return []
+        images = soup.find_all(name='img')
         uris = []
-        for item in data[1:]:
-            try:
-                uris.append(item.split('src="')[1].split('"')[0])
-            except IndexError:
-                pass
+        for img in images:
+            if img.has_attr('src'):
+                uris.append(img['src'])
         return [{'file_url': uri} for uri in uris]

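Rather than splitting the page source on 'img alt=', the parser now walks the tree with find_all. A self-contained sketch with made-up markup ('html.parser' stands in here for the html5lib builder the real parser uses):

    from bs4 import BeautifulSoup

    html = ('<div><img alt="a" src="https://example.com/1.jpg">'
            '<img alt="b"><img src="https://example.com/2.jpg"></div>')
    soup = BeautifulSoup(html, 'html.parser')

    uris = []
    for img in soup.find_all(name='img'):
        if img.has_attr('src'):          # the src-less middle img is skipped
            uris.append(img['src'])

    print([{'file_url': uri} for uri in uris])
    # [{'file_url': 'https://example.com/1.jpg'}, {'file_url': 'https://example.com/2.jpg'}]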
@@ -135,10 +135,7 @@ class Crawler(object):
         """
         if headers is None:
             headers = {
-                'User-Agent':
-                    ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3)'
-                     ' AppleWebKit/537.36 (KHTML, like Gecko) '
-                     'Chrome/48.0.2564.116 Safari/537.36')
+                'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
             }
         elif not isinstance(headers, dict):
             raise TypeError('"headers" must be a dict object')
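The default User-Agent moves from Chrome 48 on macOS to Chrome 65 on Windows, folded onto one line. A minimal check of what a server sees with such a header (hypothetical endpoint, requests assumed available):

    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36'
                             ' (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    resp = requests.get('https://httpbin.org/user-agent', headers=headers)
    print(resp.json())   # echoes the UA string the server received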
@@ -41,10 +41,13 @@ try:
     import PIL
     # noinspection PyUnresolvedReferences
     from PIL import Image as PILImage
-    from icrawler.builtin import GoogleImageCrawler
+    from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler, FlickrImageCrawler
 except ImportError:
     PIL = None
     GoogleImageCrawler = None
+    BingImageCrawler = None
+    BaiduImageCrawler = None
+    FlickrImageCrawler = None

 try:
     # noinspection PyProtectedMember
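Each new crawler gets its own None sentinel in the except ImportError branch, so later code can feature-test without re-importing. A sketch of the guard pattern this enables (hypothetical call site, not code from the commit):

    # Hypothetical call-site guard relying on the sentinels above:
    if BingImageCrawler is None:
        logger.debug("icrawler or PIL missing, bing image search disabled")
    else:
        crawler = BingImageCrawler(storage={'root_dir': '/tmp/icrawler'})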
@@ -542,48 +545,15 @@ def get_book_cover(bookid=None, src=None):
         if src:
             return None, src

-    if src == 'googleimage' or not src and lazylibrarian.CONFIG['IMP_GOOGLEIMAGE']:
-        if PIL and safeparams:
-            icrawlerdir = os.path.join(cachedir, 'icrawler', bookid)
-            gc = GoogleImageCrawler(storage={'root_dir': icrawlerdir})
-            logger.debug(safeparams)
-            logger.debug(icrawlerdir)
-            gc.crawl(keyword=safeparams, max_num=1)
-            if os.path.exists(icrawlerdir):
-                res = len(os.listdir(icrawlerdir))
-            else:
-                res = 0
-            logger.debug("Found %d %s" % (res, plural(res, 'image')))
-            if res:
-                img = os.path.join(icrawlerdir, os.listdir(icrawlerdir)[0])
-                if src:
-                    coverlink, success, _ = cache_img("book", bookid + '_gb', img)
-                else:
-                    coverlink, success, _ = cache_img("book", bookid, img, refresh=True)
-                data = ''
-                coverfile = os.path.join(lazylibrarian.DATADIR, coverlink)
-                if path_isfile(coverfile):
-                    with open(syspath(coverfile), 'rb') as f:
-                        data = f.read()
-                if len(data) < 50:
-                    logger.debug('Got an empty google search image for %s [%s]' % (bookid, coverlink))
-                elif success:
-                    logger.debug("Cached google search cover for %s %s" %
-                                 (item['AuthorName'], item['BookName']))
-                    rmtree(icrawlerdir, ignore_errors=True)
-                    return coverlink, 'google image'
-                else:
-                    logger.debug("Error getting google image %s, [%s]" % (img, coverlink))
-            else:
-                logger.debug("No images found in google page for %s" % bookid)
-                # rmtree(icrawlerdir, ignore_errors=True)
-        else:
-            if not PIL:
-                logger.debug("PIL not found for google image search for %s" % bookid)
-            else:
-                logger.debug("No parameters for google image search for %s" % bookid)
-    if src:
-        return None, src
+    if PIL and safeparams:
+        if src == 'baidu' or not src:
+            return crawl_image('baidu', src, cachedir, item, bookid, safeparams)
+        if src == 'bing' or not src:
+            return crawl_image('bing', src, cachedir, item, bookid, safeparams)
+        if src == 'flickr' or not src:
+            return crawl_image('flickr', src, cachedir, item, bookid, safeparams)
+        if src == 'googleimage' or not src and lazylibrarian.CONFIG['IMP_GOOGLEIMAGE']:
+            return crawl_image('google', src, cachedir, item, bookid, safeparams)
+
+    logger.debug("No image found from any configured source")
+    return None, src
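One subtlety survives the refactor: in the retained googleimage test, Python binds 'and' tighter than 'or', so the condition does not mean "(googleimage or no src) and config flag set". A standalone demonstration:

    # src == 'googleimage' or not src and CONFIG['IMP_GOOGLEIMAGE']
    # parses as:
    # (src == 'googleimage') or ((not src) and CONFIG['IMP_GOOGLEIMAGE'])
    src = ''
    CONFIG = {'IMP_GOOGLEIMAGE': False}
    print(src == 'googleimage' or not src and CONFIG['IMP_GOOGLEIMAGE'])
    # False: with no src the config flag decides; src == 'googleimage' always passes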
@@ -592,6 +562,49 @@ def get_book_cover(bookid=None, src=None):
     return None, src


+def crawl_image(crawler_name, src, cachedir, item, bookid, safeparams):
+    icrawlerdir = os.path.join(cachedir, 'icrawler', bookid)
+    if crawler_name == 'baidu':
+        crawler = BaiduImageCrawler(storage={'root_dir': icrawlerdir})
+    elif crawler_name == 'bing':
+        crawler = BingImageCrawler(storage={'root_dir': icrawlerdir})
+    elif crawler_name == 'flickr':
+        crawler = FlickrImageCrawler(storage={'root_dir': icrawlerdir})
+    else:
+        crawler = GoogleImageCrawler(storage={'root_dir': icrawlerdir})
+
+    crawler.crawl(keyword=safeparams, max_num=1)
+    if os.path.exists(icrawlerdir):
+        res = len(os.listdir(icrawlerdir))
+    else:
+        res = 0
+    logger.debug("Found %d %s" % (res, plural(res, 'image')))
+    if res:
+        img = os.path.join(icrawlerdir, os.listdir(icrawlerdir)[0])
+        if src:
+            coverlink, success, _ = cache_img("book", bookid + '_' + crawler_name[:2], img)
+        else:
+            coverlink, success, _ = cache_img("book", bookid, img, refresh=True)
+        data = ''
+        coverfile = os.path.join(lazylibrarian.DATADIR, coverlink)
+        if path_isfile(coverfile):
+            with open(syspath(coverfile), 'rb') as f:
+                data = f.read()
+        if len(data) < 50:
+            logger.debug('Got an empty %s search image for %s [%s]' % (crawler_name, bookid, coverlink))
+        elif success:
+            logger.debug("Cached %s search cover for %s %s" %
+                         (crawler_name, item['AuthorName'], item['BookName']))
+            rmtree(icrawlerdir, ignore_errors=True)
+            return coverlink, '%s image' % crawler_name
+        else:
+            logger.debug("Error getting %s image %s, [%s]" % (crawler_name, img, coverlink))
+    else:
+        logger.debug("No images found in %s page for %s" % (crawler_name, bookid))
+        # rmtree(icrawlerdir, ignore_errors=True)
+    return None, src
+
+
 def get_author_image(authorid=None, refresh=False, max_num=1):
     if not authorid:
         logger.error("get_author_image: No authorid")
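A minimal call sketch for the new helper, mirroring the dispatch above (the item dict and arguments are illustrative; cachedir, cache_img, logger and friends come from the surrounding module):

    # Hypothetical invocation, as the dispatch in get_book_cover would make it:
    item = {'AuthorName': 'Some Author', 'BookName': 'Some Book'}
    coverlink, source = crawl_image('bing', 'bing', cachedir, item,
                                    '12345', 'Some Author Some Book')
    if coverlink:
        print('cached %s cover at %s' % (source, coverlink))   # source == 'bing image'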
@@ -619,7 +632,13 @@ def get_author_image(authorid=None, refresh=False, max_num=1):
         if os.path.exists(icrawlerdir):
             res = len(os.listdir(icrawlerdir))
         else:
             res = 0
+        # nothing from google, try bing
+        bc = BingImageCrawler(storage={'root_dir': icrawlerdir})
+        bc.crawl(keyword=safeparams, max_num=int(max_num))
+        if os.path.exists(icrawlerdir):
+            res = len(os.listdir(icrawlerdir))
+        else:
+            res = 0
         logger.debug("Found %d %s" % (res, plural(res, 'image')))
         if max_num == 1:
             if res:
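The exists/listdir/else/0 counting idiom now appears twice in this hunk and again in crawl_image; a tiny helper could fold it up (a refactoring sketch, not part of the commit):

    import os

    def count_downloads(directory):
        # 0 when the crawler never created the directory (no hits at all)
        return len(os.listdir(directory)) if os.path.exists(directory) else 0

    # res = count_downloads(icrawlerdir) would replace each four-line block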
@@ -3471,7 +3471,7 @@ class WebInterface(object):
             bookdata.pop('Narrator', None)
             covers = []
             sources = ['current', 'cover', 'goodreads', 'librarything', 'openlibrary',
-                       'googleisbn', 'googleimage']
+                       'googleisbn', 'bing', 'googleimage']  # flickr needs an apikey, baidu doesn't like bots
             if NEW_WHATWORK:
                 sources.append('whatwork')
             for source in sources:
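The loop body is outside this hunk, but a list like this is typically consumed by asking each source in turn for a cover; a hypothetical sketch of that shape (function names taken from the diff context, not the commit's actual loop):

    # Hypothetical consumption of the sources list:
    covers = []
    for source in sources:
        coverlink, got_from = get_book_cover(bookid, source)
        if coverlink:
            covers.append({'source': got_from, 'link': coverlink})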