# nft-deal-score/viz/upload_solana_nft_labels.py

import re
import os
import json
import time
# import math
import requests
import pandas as pd
# import urllib.request
import snowflake.connector
from time import sleep
# from solana_model import just_float
# from utils import clean_name, clean_token_id, format_num, merge
############################
#     Define Constants     #
############################
BASE_PATH = '/home/data-science'
DATA_FOLDER = '/rstudio-data/nft_labels'
RPC = 'https://red-cool-wildflower.solana-mainnet.quiknode.pro/a1674d4ab875dd3f89b34863a86c0f1931f57090/'
##############################
#     Load DB Connection     #
##############################
with open('{}/data_science/util/snowflake.pwd'.format(BASE_PATH), 'r') as f:
    pwd = f.readlines()[0].strip()
with open('{}/data_science/util/snowflake.usr'.format(BASE_PATH), 'r') as f:
    usr = f.readlines()[0].strip()
ctx = snowflake.connector.connect(
    user=usr,
    password=pwd,
    account='vna27887.us-east-1'
)
############################
#     Helper Functions     #
############################
def read_csv(data_folder, fname):
    return(pd.read_csv('{}/{}.csv'.format(data_folder, fname)))

def write_csv(data_folder, fname, df, verbose=True):
    df.to_csv('{}/{}.csv'.format(data_folder, fname), index=False)
    if verbose:
        print('Wrote {} rows to {}'.format(len(df), fname))

def clean_colnames(df):
    names = [ x.lower() for x in df.columns ]
    df.columns = names
    return(df)

def clean_collection_name(x):
    # strip pipes, parentheses, and apostrophes so names are filesystem- and SQL-safe
    x = re.sub(r'\|', '-', x).strip()
    x = re.sub(r'\)', '', x).strip()
    x = re.sub(r'\(', '', x).strip()
    x = re.sub(r'\'', '', x).strip()
    return(x)
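
# Quick illustration (hypothetical input, not executed here):
#   clean_collection_name("Degen Apes | (Season 1)")  ->  'Degen Apes - Season 1'
# the pipe becomes a dash; parentheses and apostrophes are dropped.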
def merge(left, right, on=None, how='inner', ensure=True, verbose=True):
    # merge that guards against dropped rows: if the row count changes,
    # print a sample of the rows that failed to match (and optionally assert)
    df = left.merge(right, on=on, how=how)
    if len(df) != len(left) and (ensure or verbose):
        print('{} -> {}'.format(len(left), len(df)))
        cur = left.merge(right, on=on, how='left')
        cols = set(right.columns).difference(set(left.columns))
        print(cols)
        col = list(cols)[0]
        missing = cur[cur[col].isnull()]
        print(missing.head())
        if ensure:
            assert(False)
    return(df)

def Convert(tup, di):
    # note: the di argument is ignored -- it is immediately overwritten
    di = dict(tup)
    return di
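
# Quick illustration (hypothetical input): Convert is used below to collapse
# one row per trait into a single dict per mint, e.g.
#   Convert([('Background', 'Blue'), ('Fur', 'Gold')], {})  ->  {'Background': 'Blue', 'Fur': 'Gold'}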
####################################
#     Metadata From HowRare.Is     #
####################################
def how_rare_is_api():
    query = '''
    SELECT DISTINCT LOWER(project_name) AS lower_collection
    FROM solana.core.dim_nft_metadata
    '''
    df = ctx.cursor().execute(query)
    # snowflake returns unquoted identifiers uppercased, hence LOWER_COLLECTION below
    df = pd.DataFrame.from_records(iter(df), columns=[x[0] for x in df.description])
    url = 'https://api.howrare.is/v0.1/collections'
    r = requests.get(url)
    j = r.json()
    c_df = pd.DataFrame(j['result']['data']).sort_values('floor_marketcap', ascending=0)
    c_df['lower_collection'] = c_df.url.apply(lambda x: x.lower().strip() )
    # normalize names (drop spaces, underscores, apostrophes) before comparing
    seen = sorted(df.LOWER_COLLECTION.apply(lambda x: re.sub(' |_|\'', '', x) ).values)
    c_df['seen_1'] = c_df.url.apply(lambda x: re.sub(' |_|\'', '', x[1:]).lower() in seen ).astype(int)
    c_df['seen_2'] = c_df.name.apply(lambda x: re.sub(' |_|\'', '', x).lower() in seen ).astype(int)
    c_df['seen'] = (c_df.seen_1 + c_df.seen_2 > 0).astype(int)
    seen = seen + [ 'smb','aurory','degenapes','thugbirdz','degods','okay_bears','catalinawhalemixer','cetsoncreck','stonedapecrew','solgods' ]
    c_df = c_df[~(c_df.url.isin([ '/'+x for x in seen ]))]
    c_df = c_df[c_df.seen == 0]
    it = 0
    tot = len(c_df)
    m_data = []
    print('Pulling metadata for {} collections'.format(tot))
    for row in c_df.iterrows():
        it += 1
        row = row[1]
        collection = row['name']
        print('#{} / {}: {}'.format(it, tot, collection))
        url = row['url'][1:]
        if it > 1:
            # if the first collection yielded no rows, something is broken upstream
            assert(len(m_data))
        url = 'https://api.howrare.is/v0.1/collections/'+url
        r = requests.get(url)
        j = r.json()
        n_errors = 0
        for i in j['result']['data']['items']:
            try:
                token_id = int(i['id'])
                mint = i['mint']
                image = i['image']
                # flatten: one output row per (token, attribute) pair
                for d in i['attributes']:
                    d['token_id'] = token_id
                    d['collection'] = collection
                    d['mint_address'] = mint
                    d['image_url'] = image
                    m_data += [ d ]
            except:
                # print('Error')
                n_errors += 1
                pass
        if n_errors:
            print('{} errors'.format(n_errors))
    metadata = pd.DataFrame(m_data).rename(columns={'name':'feature_name', 'value':'feature_value'})
    write_csv(DATA_FOLDER, 'howrare_labels', metadata[['collection','mint_address']])
    # collapse one row per trait into one row per token, with the traits as a dict
    a = metadata.groupby(['collection','mint_address','token_id','image_url'])[[ 'feature_name','feature_value' ]].apply(lambda g: Convert(list(map(tuple, g.values.tolist())), {}) ).reset_index()
    a.columns = ['collection','mint_address','token_id','image_url','token_metadata']
    # fill in the remaining columns that solana.core.dim_nft_metadata expects
    a['commission_rate'] = None
    a['contract_address'] = a.mint_address
    a['contract_name'] = a.collection
    a['created_at_block_id'] = 0
    a['created_at_timestamp'] = '2021-01-01'
    a['created_at_tx_id'] = ''
    a['creator_address'] = a.mint_address
    a['creator_name'] = a.collection
    a['project_name'] = a.collection
    a['token_metadata_uri'] = a.image_url
    a['token_name'] = a.collection
    a['n'] = range(len(a))
    a['n'] = a.n.apply(lambda x: int(x/50) )
    a['token_id'] = a.token_id.astype(int)
    # remove existing files
    fnames = os.listdir(DATA_FOLDER+'/metadata/results/')
    print('fnames')
    print(fnames)
    for f in fnames:
        os.remove(DATA_FOLDER+'/metadata/results/'+f)
    # write new metadata incrementally to upload to solana.core.dim_nft_metadata
    n = 100000
    tot = int(len(a) / n) + 1
    for i in range(0, len(a), n):
        ind = int(i/n)
        print('#{} / {}'.format(ind, tot))
        g = a.head(i+n).tail(n).to_dict('records')
        txt = [
            {
                "model": {
                    "blockchain": "solana",
                    "sinks": [
                        {
                            "destination": "{database_name}.silver.nft_metadata",
                            "type": "snowflake",
                            "unique_key": "blockchain || contract_address || token_id"
                        }
                    ],
                },
                "results": g[x:x+50]
            }
            for x in range(0, len(g), 50)
        ]
        w = pd.DataFrame({'ind': range(len(txt)), 'results': [json.dumps(x) for x in txt] })
        write_csv( DATA_FOLDER, 'metadata/results/{}'.format(ind), w )
    return
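
# Sizing sketch (illustrative arithmetic, assuming n = 100000): a 250,000-row
# frame yields 3 result files (ind 0-2); file 0 covers rows 0-99,999, split
# into 2,000 json payloads of 50 records each via g[x:x+50].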
#################################
#     Load Data From ME API     #
#################################
def mints_from_me():
    ##################################
    #     Get All ME Collections     #
    ##################################
    headers = {
        # 'Authorization': 'Bearer 9c39e05c-db3c-4f3f-ac48-84099111b813'
    }
    data = []
    has_more = 1
    offset = 0
    # page through the ME collections endpoint until a page comes back empty
    while has_more:
        sleep(1)
        print(offset)
        url = 'https://api-mainnet.magiceden.dev/v2/collections?offset={}&limit=500'.format(offset)
        r = requests.get(url)
        j = r.json()
        data = data + j
        has_more = len(j)
        offset += 500
    df = pd.DataFrame(data)
    write_csv(DATA_FOLDER, 'me_collections', df)
    # df.to_csv('{}/me_collections.csv'.format(DATA_FOLDER), index=False)
    df = read_csv(DATA_FOLDER, 'me_collections')
    # df = pd.read_csv('./data/me_collections.csv')
    ###########################################
    #     Get 1 Mint From Each Collection     #
    ###########################################
    it = 0
    l_data = []
    # old_l_df = pd.read_csv('./data/me_mints.csv')
    old_l_df = read_csv(DATA_FOLDER, 'me_mints')
    seen = list(old_l_df.symbol.unique())
    print('We\'ve already seen {} / {} mints from ME'.format(len(seen), len(df)))
    df = df[ ~df.symbol.isin(seen) ]
    df = df[ (df.symbol.notnull()) & (df.symbol != '') ]
    df = df.sort_values('symbol')
    tot = len(df)
    start = time.time()
    for row in df.iterrows():
        sleep(0.5)
        it += 1
        row = row[1]
        # print('Listings on {}...'.format(row['symbol']))
        url = 'https://api-mainnet.magiceden.dev/v2/collections/{}/activities?offset=0&limit=1'.format(row['symbol'])
        if row['symbol'] in seen:
            print('Seen')
            continue
        # retry with increasing backoff: 10s, then 60s, then 60s again
        try:
            r = requests.get(url, headers=headers)
            j = r.json()
        except:
            try:
                print('Re-trying in 10s')
                sleep(10)
                r = requests.get(url, headers=headers)
                j = r.json()
            except:
                try:
                    print('Re-trying in 60s')
                    sleep(60)
                    r = requests.get(url, headers=headers)
                    j = r.json()
                except:
                    print('Re-trying in 60s (again!)')
                    sleep(60)
                    r = requests.get(url, headers=headers)
                    j = r.json()
        if len(j):
            l_data += [[ row['symbol'], row['name'], j[0]['tokenMint'] ]]
        if it == 1 or it % 10 == 0:
            print('#{} / {} ({} records in {} secs)'.format(it, tot, len(l_data), round(time.time() - start)))
    # l_df = pd.DataFrame(l_data, columns=['symbol','name','mint_address'])
    # l_df.to_csv('./data/me_mints.csv', index=False)
    l_df = pd.DataFrame(l_data, columns=['symbol','name','mint_address'])
    l_df = pd.concat([l_df, old_l_df]).drop_duplicates(subset=['symbol'])
    print('Adding {} rows to me_mints'.format(len(l_df) - len(old_l_df)))
    # l_df.to_csv('./data/me_mints.csv', index=False)
    write_csv(DATA_FOLDER, 'me_mints', l_df)
    ######################################################
    #     Get Update Authorities For All Collections     #
    ######################################################
    # l_df = pd.read_csv('./data/me_mints.csv')
    # m_old = pd.read_csv('./data/me_update_authorities.csv')
    m_old = read_csv(DATA_FOLDER, 'me_update_authorities')
    m_old['seen'] = 1
    m_data = list(m_old[['symbol','name','update_authority','seen']].values)
    seen = [ x[0] for x in m_data ]
    print('Seen {} m_data'.format(len(seen)))
    l_df = l_df[~l_df.symbol.isin(seen)]
    l_df = l_df.sort_values('symbol')
    it = 0
    for row in l_df.iterrows():
        sleep(.5)
        it += 1
        row = row[1]
        symbol = row['symbol']
        print('Working on {}...'.format(symbol))
        if symbol in seen:
            print('Seen')
            continue
        url = 'https://api-mainnet.magiceden.dev/v2/tokens/{}'.format(row['mint_address'])
        try:
            r = requests.get(url, headers=headers)
            j = r.json()
        except:
            print('Re-trying in 10s')
            sleep(10)
            try:
                r = requests.get(url, headers=headers)
                j = r.json()
            except:
                print('Re-trying in 60s')
                sleep(60)
                r = requests.get(url, headers=headers)
                j = r.json()
        if 'updateAuthority' in j.keys():
            m_data += [[ row['symbol'], row['name'], j['updateAuthority'], 0 ]]
        if it % 10 == 0:
            print('it#{}: {}'.format(it, len(m_data)))
    # m_df = pd.DataFrame(m_data, columns=['symbol','name','update_authority'])
    # m_df.to_csv('./data/me_update_authorities.csv', index=False)
    m_df = pd.DataFrame(m_data, columns=['symbol','name','update_authority','seen'])
    m_df = m_df.drop_duplicates()
    print('Adding {} rows to me_update_authorities'.format(len(m_df) - len(m_old)))
    write_csv(DATA_FOLDER, 'me_update_authorities', m_df)
    # m_df.to_csv('./data/me_update_authorities.csv', index=False)
def pull_from_metaboss():
    ######################################################
    #     Get Update Authorities For All Collections     #
    ######################################################
    # m_df = pd.read_csv('./data/me_update_authorities.csv')
    m_df = read_csv(DATA_FOLDER, 'me_update_authorities')
    # n_auth = how many distinct collections share each update_authority
    n_auth = m_df.groupby('update_authority').name.count().reset_index().rename(columns={'name':'n_auth'})
    m_df = m_df.merge(n_auth)
    l1 = len(m_df[ (m_df.seen == 0) & (m_df.n_auth == 1) ])
    l2 = len(m_df[ (m_df.seen == 0) & (m_df.n_auth > 1) ])
    print('{} collections whose update_authority has 1 collection; {} whose update_authority has 2+'.format(l1, l2))
    need = list(m_df[ (m_df.seen == 0) & (m_df.n_auth == 1) ].update_authority.unique())
    need = m_df[m_df.update_authority.isin(need)]
    # l_df = pd.read_csv('./data/me_mints.csv')
    l_df = read_csv(DATA_FOLDER, 'me_mints')
    fix = need.merge(l_df[[ 'name','mint_address' ]])
    need = fix.copy().rename(columns={'name':'collection'})
    # need = need.drop_duplicates(subset=['update_authority']).sort_values('collection').head(7).tail(1)
    need['collection'] = need.collection.apply(lambda x: clean_collection_name(x) )
    need = need.drop_duplicates(subset=['update_authority']).sort_values('collection')
    # need = need.head(2)
    mfiles = ['/data/mints/{}/{}_mint_accounts.json'.format(re.sub(' |-', '_', collection), update_authority) for collection, update_authority in zip(need.collection.values, need.update_authority.values) ]
    seen = [ x for x in mfiles if os.path.exists(x) ]
    seen = []  # reset: the existence check above is discarded, so nothing is skipped
    # for update authorities that have only 1 collection, we can just check metaboss once
    mfolder = '{}/mints/'.format(DATA_FOLDER)
    it = 0
    tot = len(need)
    for row in need.iterrows():
        it += 1
        row = row[1]
        collection = row['collection']
        print('#{} / {}: {}'.format(it, tot, collection))
        # if collection in seen:
        #     continue
        update_authority = row['update_authority']
        # print('Working on {}...'.format(collection))
        collection_dir = re.sub(' |-', '_', collection)
        dir = '{}{}/'.format(mfolder, collection_dir)
        mfile = '{}{}_mint_accounts.json'.format(dir, update_authority)
        if not os.path.exists(dir):
            print(collection)
            os.makedirs(dir)
        # elif len(os.listdir(dir)) and os.path.exists(mfile):
        #     print('Already have {}.'.format(collection))
        #     print('Seen')
        #     continue
        seen.append(update_authority)
        os.system('metaboss -r {} -T 300 snapshot mints --update-authority {} --output {}'.format(RPC, update_authority, dir))
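    # The shell call above expands to roughly (placeholder values shown):
    #   metaboss -r <RPC> -T 300 snapshot mints \
    #       --update-authority <UPDATE_AUTHORITY> \
    #       --output /rstudio-data/nft_labels/mints/<Collection_Dir>/
    # which writes <UPDATE_AUTHORITY>_mint_accounts.json -- the mfile path
    # constructed above and read back in the section below.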
    # write the mints to csv
    data = []
    for path in os.listdir(mfolder):
        if os.path.isdir('{}{}'.format(mfolder, path)):
            collection = re.sub('_', ' ', path).strip()
            for fname in os.listdir(mfolder+path):
                f = mfolder+path+'/'+fname
                if os.path.isfile(f) and '.json' in f:
                    with open(f) as file:
                        j = json.load(file)
                        for m in j:
                            data += [[ collection, m ]]
    df = pd.DataFrame(data, columns=['collection','mint_address'])
    df.collection.unique()
    write_csv(DATA_FOLDER, 'single_update_auth_labels', df)
    # df.to_csv('./data/single_update_auth_labels.csv', index=False)
    ################################
    #     Multiple Authorities     #
    ################################
    need = list(m_df[ (m_df.seen == 0) & (m_df.n_auth > 1) ].update_authority.unique())
    need = m_df[m_df.update_authority.isin(need)]
    fix = need.merge(l_df[[ 'name','mint_address' ]])
    need = fix.copy().rename(columns={'name':'collection'})
    need['collection'] = need.collection.apply(lambda x: clean_collection_name(x) )
    need = need.sort_values('collection').drop_duplicates(subset=['update_authority'], keep='first')
    # need = need.head(2)
    it = 0
    a = []
    for row in need.iterrows():
        it += 1
        print('#{}/{}'.format(it, len(need)))
        row = row[1]
        collection = row['collection']
        update_authority = row['update_authority']
        print('Working on {}...'.format(collection))
        collection_dir = re.sub(' |-', '_', collection)
        dir = '{}{}/'.format(mfolder, collection_dir)
        mfile = '{}{}_mint_accounts.json'.format(dir, update_authority)
        if not os.path.exists(dir):
            print(collection)
            os.makedirs(dir)
        a.append(update_authority)
        os.system('metaboss -r {} -T 300 snapshot mints --update-authority {} --output {}'.format(RPC, update_authority, dir))
        odir = dir+'output/'
        if not os.path.exists(odir):
            print('Making dir {}'.format(odir))
            os.makedirs(odir)
        os.system('metaboss -r {} -T 300 decode mint --list-file {} --output {}'.format(RPC, mfile, odir ))
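    # `decode mint` then writes one <mint_address>.json per mint into output/;
    # the loader below reads each file's name / symbol / uri fields.
    # (Behavior inferred from how the output/ files are parsed below.)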
    ##################################################
    #     Load All The Mints for Each Collection     #
    ##################################################
    # now that we have the mints, create a data frame with the info for each mint in each collection
    mfolder = '{}/mints/'.format(DATA_FOLDER)
    data = []
    seen = [ x[1] for x in data ]
    it = 0
    dirs = sorted(os.listdir(mfolder))
    dirs = [ x for x in dirs if x not in ['3D_Sniping_Demons'] ]
    tot = len(dirs)
    for path in dirs:
        print('{} / {} ({} records)'.format(it, tot, len(data)))
        it += 1
        if os.path.isdir(mfolder+path):
            collection = re.sub('_', ' ', path).strip()
            print('Found {}'.format(collection))
            if not os.path.exists(mfolder+path+'/output/'):
                print('No output')
                continue
            fnames = os.listdir(mfolder+path+'/output/')
            print('{} files found'.format(len(fnames)))
            for fname in fnames:
                f = mfolder+path+'/output/'+fname
                if fname[:-5] in seen:
                    continue
                if os.path.isfile(f) and '.json' in f:
                    try:
                        with open(f) as file:
                            j = json.load(file)
                            data += [[ collection, fname, j['name'], j['symbol'], j['uri'] ]]
                    except:
                        print('Error {}'.format(fname[:-5]))
    ##################################################
    #     Combine Mints Into A Single Data Frame     #
    ##################################################
    new_mints = pd.DataFrame(data, columns=['collection','mint_address','name','symbol','uri'])
    # tmp = tmp[-(tmp.collection.isin(['Dskullys','Decimusdynamics']))]
    n = len(new_mints[(new_mints.uri.isnull()) | (new_mints.uri == '')])
    tot = len(new_mints)
    pct = round(n * 100 / tot, 1)
    print('{} ({}%) rows have no uri'.format(n, pct))
    new_mints = new_mints[new_mints.uri != '']
    # function to clean the name of each NFT (remove the trailing number)
    def f_cn(x):
        if not x or x != x:  # catch empty strings and NaN (NaN != NaN)
            return(x)
        if '#' in x[-6:]:
            x = ''.join(re.split('#', x)[:-1]).strip()
        elif bool(re.match('.+\s+[0-9]+', x)):
            x = ' '.join(re.split(' ', x)[:-1]).strip()
        return(x)
    new_mints['clean_name'] = new_mints.name.apply(lambda x: f_cn(x) )
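    # Illustrative inputs: f_cn('Degen Ape #1234') -> 'Degen Ape';
    # f_cn('Solana Monkey 42') -> 'Solana Monkey'; None / NaN pass through.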
    # Determine, for each collection, whether to key on collection-name-symbol,
    # collection-symbol, or just collection when resolving which collection a
    # mint actually belongs to. Some collections reuse a handful of names, so
    # we can iterate over each combo; others give every NFT a unique name, so
    # we assume one collection for all of them.
    a = new_mints.drop_duplicates(subset=['collection','clean_name','symbol']).groupby(['collection']).uri.count().reset_index().sort_values('uri', ascending=0)
    symbol_only = a[a.uri > 10].collection.unique()
    b = new_mints[new_mints.collection.isin(symbol_only)].drop_duplicates(subset=['collection','symbol']).groupby(['collection']).uri.count().reset_index().sort_values('uri', ascending=0)
    collection_only = b[b.uri > 10].collection.unique()
    symbol_only = [ x for x in symbol_only if x not in collection_only ]
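    # Illustrative thresholds: a collection with, say, 3 distinct
    # (clean_name, symbol) combos keeps the full key; more than 10 combos
    # demotes it to symbol-only, and more than 10 symbols to collection-only.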
    # now get one representative row for each collection-name-symbol combo
    g1 = new_mints[ (~(new_mints.collection.isin(symbol_only))) & (~(new_mints.collection.isin(collection_only))) ].groupby(['collection','clean_name','symbol']).head(1).reset_index()
    g2 = new_mints[ ((new_mints.collection.isin(symbol_only))) & (~(new_mints.collection.isin(collection_only))) ].groupby(['collection','symbol']).head(1).reset_index()
    g3 = new_mints[ (~(new_mints.collection.isin(symbol_only))) & ((new_mints.collection.isin(collection_only))) ].groupby(['collection']).head(1).reset_index()
    g = pd.concat([g1, g2, g3]).drop_duplicates(subset=['mint_address'])
    print('{} Total: {} all, {} collection-symbol, {} collection'.format(len(g), len(g1), len(g2), len(g3)))
    # g.to_csv('~/Downloads/tmp-g.csv', index=False)
    # iterate over each row to find which collection it actually belongs to,
    # by pulling data from the uri
    uri_data = []
    it = 0
    tot = len(g)
    print(tot)
    errs = []
    seen = [ x['uri'] for x in uri_data ]
    # for row in g[ -(g.uri.isin(seen)) ].iterrows():
    for row in g.iterrows():
        row = row[1]
        it += 1
        # if it % 100 == 0:
        #     uri_df = pd.DataFrame(uri_data)[[ 'collection','name','symbol','row_symbol','row_collection','uri','row_clean_name','mint_address' ]]
        #     uri_df.to_csv('~/Downloads/uri_df.csv', index=False)
        print('#{} / {}: {}'.format(it, tot, row['collection']))
        try:
            r = requests.get(row['uri'])
            j = r.json()
            j['uri'] = row['uri']
            j['row_collection'] = row['collection']
            j['row_clean_name'] = row['clean_name']
            j['row_symbol'] = row['symbol']
            j['mint_address'] = row['mint_address']
            uri_data += [j]
        except:
            print('Error')
            errs.append(row)
    uri_df = pd.DataFrame(uri_data)[[ 'collection','name','symbol','row_symbol','row_collection','uri','row_clean_name','mint_address' ]]
    write_csv(DATA_FOLDER, 'uri_df', uri_df)
    # uri_df.to_csv('~/Downloads/uri_df.csv', index=False)
    # for each row, parse the json stored as a string in the uri columns;
    # each except falls through to a more permissive parse, ending with the raw value
    # uri_df = pd.read_csv('~/Downloads/uri_df.csv')
    # read_csv(DATA_FOLDER, 'uri_df')
    def f(x, c):
        x = str(x)
        try:
            n = json.loads(re.sub("'", "\"", x))[c]
            if type(n) == list:
                return(n[0])
            return(n)
        except:
            try:
                return(json.loads(re.sub("'", "\"", x))[c])
            except:
                try:
                    return(json.loads(re.sub("'", "\"", x))[0][c])
                except:
                    try:
                        return(json.loads(re.sub("'", "\"", x))[0])
                    except:
                        return(x)
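    # Illustrative inputs: f("{'name': 'DeGods', 'family': 'DeGods'}", 'name')
    # -> 'DeGods'; f("['DeGods']", 'name') -> 'DeGods' (via the [0] fallback);
    # f('not json', 'name') -> 'not json' (returned unchanged).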
    # parse the collection blob into a name and a family
    uri_df['parsed_collection'] = uri_df.collection.apply(lambda x: f(x, 'name') )
    uri_df['parsed_family'] = uri_df.collection.apply(lambda x: f(x, 'family') )
    uri_df['clean_name'] = uri_df.name.apply( lambda x: f_cn(x) )
    # calculate the collection name: prefer parsed_collection, fall back to clean_name
    # (pd.NA rather than None: replace('', None) forward-fills in some pandas versions)
    uri_df['use_collection'] = uri_df.parsed_collection.replace('', pd.NA).fillna( uri_df.clean_name )  # .fillna( uri_df.row_symbol )
    # uri_df[uri_df.use_collection == 'nan'][['use_collection','parsed_collection','parsed_family','clean_name','name','collection','symbol','row_symbol','row_collection']].head()
    # uri_df[uri_df.use_collection == 'nan'][['use_collection','parsed_collection','parsed_family','clean_name','name','collection','symbol','row_symbol','row_collection']].to_csv('~/Downloads/tmp.csv', index=False)
    len(uri_df)
    # clean the collection name
    def f1(x):
        try:
            if len(x['use_collection']) == 1:
                return(x['clean_name'])
            if bool(re.match('.+\s+#[0-9]+', x['use_collection'])):
                return(''.join(re.split('#', x['use_collection'])[:-1]).strip())
            if '{' in x['use_collection']:
                return(x['clean_name'])
            return(x['use_collection'].strip().title())
        except:
            return(x['use_collection'].strip().title())
    uri_df['tmp'] = uri_df.apply(lambda x: f1(x), 1 )
    uri_df['use_collection'] = uri_df.apply(lambda x: f1(x), 1 )
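    # Illustrative inputs: f1 strips a trailing '#123' ('Degen Apes #123' ->
    # 'Degen Apes'), falls back to clean_name for 1-char or json-ish values,
    # and title-cases the rest ('okay bears' -> 'Okay Bears').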
    # clean the mint_address: anchor the pattern so only a trailing .json
    # extension is stripped (a bare '.json' would match any char + 'json')
    uri_df['mint_address'] = uri_df.mint_address.apply(lambda x: re.sub(r'\.json$', '', x))
    uri_df = uri_df.fillna('None')
    for i in range(2):
        # for each collection-name-symbol combo, see how many have multiple mappings
        a = uri_df.copy().fillna('None')
        a = a[['row_collection','row_clean_name','row_symbol','use_collection']].drop_duplicates().groupby(['row_collection','row_clean_name','row_symbol']).use_collection.count().reset_index().rename(columns={'use_collection':'n_1'})
        uri_df = merge(uri_df, a, ensure=True)
        # for each collection-symbol combo, see how many have multiple mappings
        a = uri_df.copy().fillna('None')
        a = a[['row_collection','row_symbol','use_collection']].drop_duplicates().groupby(['row_collection','row_symbol']).use_collection.count().reset_index().rename(columns={'use_collection':'n_2'})
        uri_df = merge(uri_df, a, ensure=True)
        # for each collection, see how many have multiple mappings
        a = uri_df.copy().fillna('None')
        a = a[['row_collection','use_collection']].drop_duplicates().groupby(['row_collection']).use_collection.count().reset_index().rename(columns={'use_collection':'n_3'})
        uri_df = merge(uri_df, a, ensure=True)
        uri_df['n'] = uri_df.apply(lambda x: x['n_3'] if x['row_collection'] in collection_only else x['n_2'] if x['row_collection'] in symbol_only else x['n_1'], 1 )
        print('{} / {} ({}%) have multiple collection-name-symbol mappings'.format(len(uri_df[uri_df.n > 1]), len(uri_df), round( 100.0 * len(uri_df[uri_df.n > 1]) / len(uri_df))))
        # if there are multiple, use the parsed_family instead of the use_collection
        uri_df['use_collection'] = uri_df.apply(lambda x: x['use_collection'] if x['n'] == 1 else x['parsed_family'], 1 )
        del uri_df['n_1']
        del uri_df['n_2']
        del uri_df['n_3']
    # only take rows where there is a single mapping
    m = uri_df[uri_df.n == 1][[ 'use_collection','row_collection','row_clean_name','row_symbol' ]].dropna().drop_duplicates()
    m.columns = [ 'use_collection','collection','clean_name','symbol' ]
    m_1 = new_mints[ (~(new_mints.collection.isin(symbol_only))) & (~(new_mints.collection.isin(collection_only))) ].fillna('').merge(m.fillna(''), how='left')
    m_2 = new_mints[ ((new_mints.collection.isin(symbol_only))) & (~(new_mints.collection.isin(collection_only))) ][[ 'collection','mint_address','symbol' ]].fillna('').merge(m.fillna(''), how='left')
    m_3 = new_mints[ (~(new_mints.collection.isin(symbol_only))) & ((new_mints.collection.isin(collection_only))) ][[ 'collection','mint_address' ]].fillna('').merge(m.fillna(''), how='left')
    len(m_1) + len(m_2) + len(m_3)  # debug leftover: a no-op outside a repl
    len(new_mints)
    # m = new_mints.fillna('').merge(m.fillna(''), how='left')
    m = pd.concat([ m_1, m_2, m_3 ])
    print('After all this, we have {}% of the mints'.format( round(len(m) * 100 / len(new_mints)) ))
    len(new_mints)
    len(m)
    m['mint_address'] = m.mint_address.apply(lambda x: re.sub(r'\.json$', '', x) )
    m = m[['mint_address','use_collection']].dropna().drop_duplicates()
    m.columns = ['mint_address','collection']
    m[m.collection.isnull()].head()
    m[m.collection == 'Nan'].head()
    m = m[m.collection != 'Nan']
    tmp = m.groupby('collection').mint_address.count().reset_index().sort_values('mint_address', ascending=0)
    tmp.head()
    # m.to_csv('./data/mult_update_auth_labels.csv', index=False)
    write_csv(DATA_FOLDER, 'mult_update_auth_labels', m)
def compile():
    # note: shadows the builtin compile(), but it is only invoked at module level below
    single_update_auth_labels = read_csv(DATA_FOLDER, 'single_update_auth_labels')
    mult_update_auth_labels = read_csv(DATA_FOLDER, 'mult_update_auth_labels')
    howrare_labels = read_csv(DATA_FOLDER, 'howrare_labels')
    df = pd.concat([howrare_labels, single_update_auth_labels, mult_update_auth_labels])
    df = df[ (df.collection != 'Nan') & (df.collection != 'nan') & (df.collection.notnull()) ]
    df = df[ (df.mint_address != 'Nan') & (df.mint_address != 'nan') & (df.mint_address.notnull()) ]
    # howrare labels win on duplicate mints: keep='first' prefers earlier frames
    df = df.drop_duplicates(subset=['mint_address'], keep='first')
    write_csv(DATA_FOLDER, 'solana_nft_labels', df[['mint_address','collection']])
    # print('Loaded!')
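
# Intended run order (uncomment below as needed): mints_from_me() discovers
# collections and update authorities, pull_from_metaboss() snapshots their
# mints, and compile() merges the label files; how_rare_is_api() populates
# the metadata upload independently.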
# mints_from_me()
# pull_from_metaboss()
# compile()
# how_rare_is_api()