flipside-kellen 2021-12-23 12:00:31 -08:00
parent e14a6b539b
commit 9dd1d71538
10 changed files with 887 additions and 427 deletions

View File

@@ -55,8 +55,10 @@ def run_queries():
metadata = ctx.cursor().execute(' '.join(query))
metadata = pd.DataFrame.from_records(iter(metadata), columns=[x[0] for x in metadata.description])
metadata = clean_colnames(metadata)
metadata['image'] = metadata.image.apply(lambda x: 'https://cloudflare-ipfs.com/ipfs/'+re.split('/', x)[-1] )
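# keep only the last path segment of the image URI (the IPFS CID) and serve it through the Cloudflare IPFS gateway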
metadata['collection'] = c
metadata['chain'] = 'Terra'
list(metadata.image.values[:2]) + list(metadata.image.values[-2:])
metadata.to_csv('./data/metadata/{}.csv'.format(c), index=False)
# old = pd.read_csv('./data/metadata.csv')
# old = old[-old.collection.isin(metadata.collection.unique())]
@@ -75,7 +77,7 @@ def add_terra_tokens():
, msg_value:execute_msg:mint_nft:extension:name AS name
, msg_value:execute_msg:mint_nft:extension:image AS image
FROM terra.msgs
WHERE msg_value:contract::string = 'terra1trn7mhgc9e2wfkm5mhr65p3eu7a2lc526uwny2'
WHERE msg_value:contract::string = 'terra16wuzgsx3tz4hkqu73q5s7unxenefkkvefvewsh'
AND tx_status = 'SUCCEEDED'
AND msg_value:execute_msg:mint_nft is not null
'''
@@ -164,6 +166,9 @@ def add_terra_metadata():
metadata['attribute_count'] = 0
l = len(metadata)
incl_att_count = not collection in [ 'Levana Dragon Eggs' ]
metadata.groupby('cracking_date').token_id.count()
metadata.groupby('weight').token_id.count()
metadata[metadata.cracking_date=='2471-12-22'][['token_id']]
for c in list(metadata.columns) + ['attribute_count']:
if c in ['token_id','collection','pct','levana_rank','meteor_id']:
continue
@@ -186,11 +191,18 @@ def add_terra_metadata():
# metadata.sort_values('pct_rank')
metadata.sort_values('pct')
metadata['rank'] = metadata.pct.rank()
metadata['score'] = metadata.pct.apply(lambda x: 1.0 / x )
mn = metadata.score.min()
metadata['score'] = metadata.score.apply(lambda x: x / mn )
metadata.score.max()
metadata.sort_values('rank')[['rank','pct','score']]
metadata['rarity_score'] = metadata.pct.apply(lambda x: 1.0 / (x**0.2) )
mn = metadata.rarity_score.min()
mx = metadata.rarity_score.max()
metadata['rarity_score'] = metadata.rarity_score.apply(lambda x: round(((x - mn) * 999 / (mx - mn)) + 1) )
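# Worked example of the rescaling above: the minimum raw score (1 / pct**0.2) maps to round(0 + 1) = 1
# and the maximum maps to round(999 + 1) = 1000, so rarity_score runs from 1 (most common) to 1000 (rarest).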
metadata.sort_values('rarity_score', ascending=0).head(20)[['token_id','collection_rank','rarity_score']]
metadata.sort_values('rarity_score', ascending=0).tail(20)[['token_id','collection_rank','rarity_score']]
metadata[metadata.token_id==6157].sort_values('rarity_score', ascending=0).tail(20)[['token_id','collection_rank','rarity_score','rank']]
metadata[metadata['rank']>=3000].groupby('weight').token_id.count()
metadata.rarity_score.max()
metadata.rarity_score.min()
metadata.sort_values('rank')[['rank','pct','rarity_score']]
m = pd.DataFrame()
for c in metadata.columns:
@@ -201,16 +213,20 @@ def add_terra_metadata():
m = m.append(cur)
m['chain'] = 'Terra'
m.groupby('feature_name').feature_value.count()
m[m.feature_name=='face'].groupby('feature_value').token_id.count()
print(len(m.token_id.unique()))
if collection == 'Levana Dragon Eggs':
add = m[m.feature_name=='collection_rank']
add['feature_name'] = 'transformed_collection_rank'
add['feature_value'] = add.feature_value.apply(lambda x: (1.0/ (x + 0.5))**1 )
m = m.append(add)
g = m.groupby('feature_value').feature_name.count().reset_index().sort_values('feature_name').tail(50)
old = pd.read_csv('./data/metadata.csv')
if not 'chain' in old.columns:
old['chain'] = old.collection.apply(lambda x: 'Terra' if x in [ 'Galactic Punks', 'LunaBulls' ] else 'Solana' )
old = old[-old.collection.isin(m.collection.unique())]
old = old.append(m)
old = old.drop_duplicates()
print(old.groupby(['chain','collection']).token_id.count())
old = old.drop_duplicates(subset=['collection','token_id','feature_name'])
old = old[-(old.feature_name.isin(['last_sale']))]
# print(old.groupby(['chain','collection']).token_id.count())
print(old[['chain','collection','token_id']].drop_duplicates().groupby(['chain','collection']).token_id.count())
old.to_csv('./data/metadata.csv', index=False)

View File

@@ -4,7 +4,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -39,7 +39,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -74,7 +74,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -109,7 +109,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,

View File

@@ -5,7 +5,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -38,7 +38,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -74,7 +74,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -109,7 +109,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -144,7 +144,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -179,7 +179,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,

View File

@@ -4,7 +4,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,

View File

@@ -39,16 +39,18 @@ def clean_name(name):
def scrape_randomearth():
d_address = {
'Galactic Punks': 'terra103z9cnqm8psy0nyxqtugg6m7xnwvlkqdzm4s4k',
'LunaBulls': 'terra1trn7mhgc9e2wfkm5mhr65p3eu7a2lc526uwny2'
'LunaBulls': 'terra1trn7mhgc9e2wfkm5mhr65p3eu7a2lc526uwny2',
'Levana Dragon Eggs': 'terra1k0y373yxqne22pc9g7jvnr4qclpsxtafevtrpg',
}
data = []
for collection in [ 'Galactic Punks', 'LunaBulls' ]:
# for collection in [ 'Levana Dragon Eggs' ]:
for collection in d_address.keys():
print(collection)
page = 0
has_more = True
while has_more:
page += 1
print('Page #{}'.format(page))
print('Page #{} ({})'.format(page, len(data)))
url = 'https://randomearth.io/api/items?collection_addr={}&sort=price.asc&page={}&on_sale=1'.format( d_address[collection], page)
browser.get(url)
soup = BeautifulSoup(browser.page_source)
@@ -59,6 +61,7 @@ def scrape_randomearth():
for i in j['items']:
data += [[ 'Terra', collection, i['token_id'], i['price'] / (10 ** 6) ]]
df = pd.DataFrame(data, columns=['chain','collection','token_id','price'])
df.to_csv('~/Downloads/tmp.csv', index=False)
old = pd.read_csv('./data/listings.csv')
old = old[-old.collection.isin(df.collection.unique())]
old = old.append(df)
@@ -189,9 +192,12 @@ def convert_collection_names():
,'boryokudragonz': 'Boryoku Dragonz'
}
for c in [ 'pred_price', 'attributes', 'feature_values', 'model_sales', 'listings', 'coefsdf', 'tokens' ]:
try:
df = pd.read_csv('./data/{}.csv'.format(c))
df['collection'] = df.collection.apply(lambda x: clean_name(x) if x in d.keys() else x )
df.to_csv('./data/{}.csv'.format(c), index=False)
except:
pass
def scrape_recent_sales():
o_sales = pd.read_csv('./data/sales.csv')
@@ -234,6 +240,7 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk
, 'degenapes': 'degen-ape-academy'
, 'peskypenguinclub': 'pesky-penguins'
}
collection = 'smb'
for collection in collections:
if collection == 'boryokudragonz':
continue
@@ -249,7 +256,7 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk
print('{} page #{} ({})'.format(collection, page, len(data)))
sleep(3)
page += 1
for j in [25, 30, 35, 30, 25] * 2:
for j in [20, 30, 30, 30, 30, 30, 30, 30] * 1:
for _ in range(1):
soup = BeautifulSoup(browser.page_source)
# for row in browser.find_elements_by_class_name('ag-row'):
@@ -325,6 +332,7 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk
pred_price = pd.read_csv('./data/pred_price.csv')[['collection','token_id','pred_price','pred_sd']]
pred_price['collection'] = pred_price.collection.apply(lambda x: clean_name(x))
pred_price['token_id'] = pred_price.token_id.astype(str)
pred_price = pred_price.merge(listings)
coefsdf = pd.read_csv('./data/coefsdf.csv')
@@ -338,7 +346,10 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk
metadata = pd.read_csv('./data/metadata.csv')
solana_blob = metadata[ (metadata.collection == 'aurory') & (metadata.feature_name == 'skin') & (metadata.feature_value == 'Solana Blob (9.72%)')].token_id.unique()
pred_price['pred_price'] = pred_price.apply(lambda x: (x['pred_price'] * 0.8) - 8 if x['token_id'] in solana_blob and x['collection'] == 'Aurory' else x['pred_price'], 1 )
pred_price['pred_price'] = pred_price.apply(lambda x: (x['pred_price'] * 0.8) - 4 if x['token_id'] in solana_blob and x['collection'] == 'Aurory' else x['pred_price'], 1 )
solana_blob = metadata[ (metadata.collection == 'aurory') & (metadata.feature_name == 'hair') & (metadata.feature_value == 'Long Blob Hair (9.72%)')].token_id.unique()
pred_price['pred_price'] = pred_price.apply(lambda x: (x['pred_price'] * 0.8) - 2 if x['token_id'] in solana_blob and x['collection'] == 'Aurory' else x['pred_price'], 1 )
pred_price['abs_chg'] = (pred_price.floor - pred_price.floor_price) * pred_price.lin_coef
pred_price['pct_chg'] = (pred_price.floor - pred_price.floor_price) * pred_price.log_coef
@@ -711,6 +722,7 @@ def scratch():
# print('Sleeping until {}'.format(sleep_to))
# sleep(60 * 15)
alerted = []
scrape_randomearth()
alerted = scrape_listings(alerted = alerted)
# scrape_randomearth()
# alerted = scrape_listings(['smb'],alerted = alerted)
convert_collection_names()

View File

@@ -10,7 +10,7 @@ import tensorflow as tf
import snowflake.connector
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
warnings.filterwarnings('ignore')
@@ -18,26 +18,45 @@ warnings.filterwarnings('ignore')
os.chdir('/Users/kellenblumberg/git/nft-deal-score')
CHECK_EXCLUDE = False
CHECK_EXCLUDE = True
# CHECK_EXCLUDE = True
# Using sales from howrare.is: the last sale under 300 happened when the floor was at 72; filtering to periods when the floor is >100, the lowest sale was 400
###################################
# Define Helper Functions #
###################################
def standardize_df(df, cols, usedf=None):
def standardize_df(df, cols, usedf=None, verbose=False):
for c in cols:
if type(usedf) != type(pd.DataFrame()):
usedf = df
mu = usedf[c].mean()
sd = usedf[c].std()
# print(c)
if verbose:
print(c)
if len(df[c].unique()) == 2 and df[c].max() == 1 and df[c].min() == 0:
df['std_{}'.format(c)] = df[c].apply(lambda x: (x*2) - 1 )
else:
df['std_{}'.format(c)] = (df[c] - mu) / sd
return(df)
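# Note on standardize_df: 0/1 dummy columns are recoded to -1/+1; all other columns are z-scored
# using the mean/sd of usedf (pass the training frame as usedf to reuse its scaling on new data).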
def merge(left, right, on=None, how='inner', ensure=True, verbose=True):
df = left.merge(right, on=on, how=how)
if len(df) != len(left) and (ensure or verbose):
print('{} -> {}'.format(len(left), len(df)))
cur = left.merge(right, on=on, how='left')
cols = set(right.columns).difference(set(left.columns))
print(cols)
col = list(cols)[0]
missing = cur[cur[col].isnull()]
print(missing.head())
if ensure:
assert(False)
return(df)
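# Usage sketch: merge(sales, num_metadata, ['collection','token_id'], ensure=False) (as below) prints
# the row-count change and sample unmatched rows when the join changes the row count; ensure=True asserts instead.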
def just_float(x):
x = re.sub('[^\d\.]', '', str(x))
return(float(x))
def calculate_percentages(df, cols=[]):
add_pct = not 'pct' in df.columns
if not len(cols):
@@ -64,13 +83,9 @@ exclude = [
# ( 'aurory', 3323, 138 )
]
s_df = pd.read_csv('./data/sales.csv').rename(columns={'sale_date':'block_timestamp'})
s_df[ s_df.collection == 'Levana Dragons' ].sort_values('block_timestamp', ascending=0).head()
print(len(s_df[s_df.collection == 'Levana Dragon Eggs']))
print(s_df.groupby('collection').token_id.count())
s_df.collection.unique()
s_df = s_df[-s_df.collection.isin(['Levana Meteors','Levana Dust'])]
s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
s_df = s_df[ -s_df.collection.isin(['boryokudragonz', 'Boryoku Dragonz']) ]
s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
for e in exclude:
s_df = s_df[-( (s_df.collection == e[0]) & (s_df.token_id == e[1]) & (s_df.price == e[2]) )]
s_df = s_df[ -((s_df.collection == 'smb') & (s_df.price < 1)) ]
@@ -82,241 +97,183 @@ if not CHECK_EXCLUDE:
s_df = s_df[s_df.exclude.isnull()]
del s_df['exclude']
#########################
# Load Metadata #
#########################
m_df = pd.read_csv('./data/metadata.csv')
m_df['token_id'] = m_df.token_id.astype(str)
tmp = m_df[m_df.collection.isin(['Levana Dragon Eggs','Levana Meteors','Levana Dust'])]
tmp['tmp'] = tmp.token_id.astype(int)
tmp.groupby('collection').tmp.max()
m_df.head()
# s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(x[:10], '%Y-%m-%d %H:%M:%S') )
# remove ones that are not actually metadata
m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
m_df['feature_value'] = m_df.feature_value.apply(lambda x: re.split("\(", re.sub("\"", "", x))[0] if type(x)==str else x )
m_df[(m_df.feature_name=='rank') & (m_df.collection == 'Levana Dragon Eggs')]
#####################################
# Exclude Special LunaBulls #
#####################################
tokens = pd.read_csv('./data/tokens.csv')
tokens.token_id.unique()
lunabullsrem = tokens[tokens.clean_token_id>=10000].token_id.unique()
m_df = m_df[ -((m_df.collection == 'LunaBulls') & (m_df.token_id.isin(lunabullsrem))) ]
s_df = s_df[ -((s_df.collection == 'LunaBulls') & (s_df.token_id.isin(lunabullsrem))) ]
###########################
# Calculate Floor #
###########################
s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(str(x)[:19], '%Y-%m-%d %H:%M:%S') if len(x) > 10 else datetime.strptime(x[:10], '%Y-%m-%d') )
s_df['timestamp'] = s_df.block_timestamp.astype(int)
# del metadata['price']
# del metadata['last_sale']
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['days_ago'] = s_df.block_timestamp.apply(lambda x: (datetime.today() - x).days ).astype(int)
s_df[[ 'block_timestamp','days_ago' ]].drop_duplicates(subset=['days_ago'])
s_df['av_20'] = s_df.groupby('collection')['mn_20'].rolling(20).mean().reset_index(0,drop=True)
s_df = s_df.sort_values(['collection','block_timestamp'])
# s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).median().reset_index(0,drop=True)
s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)
# s_df[ (-((s_df.price) >= (s_df.md_20 * 0.2))) & (s_df.price.notnull()) & (s_df.collection == 'Levana Dragon Eggs') ]
s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.75) ]
# lowest price in last 20 sales
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)
# exclude sales that are far below the existing floor
s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.70) ]
# 10%ile of last 20 sales
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
s_df = s_df.sort_values(['collection','block_timestamp'])
# s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).min().reset_index(0,drop=True)
s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.1).reset_index(0,drop=True)
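# mn_20 is now the 10th percentile of the previous 20 sale prices per collection and is used downstream
# as the rolling floor proxy (and as floor_price in coefsdf) for the pricing models below.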
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(21).tail(40)
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(20).sort_values('price')
s_df['tmp'] = s_df.mn_20 / s_df.md_20
tmp = s_df[s_df.collection=='smb'][['mn_20','block_timestamp']]
tmp['date'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
tmp = tmp.groupby('date').mn_20.median().reset_index()
tmp.to_csv('~/Downloads/tmp.csv', index=False)
s_df['tmp'] = s_df.price / s_df.mn_20
s_df[s_df.collection == 'smb'].sort_values('block_timestamp')[['token_id','price','mn_20']]
s_df[s_df.collection == 'smb'].sort_values('tmp').head(20)[['collection','token_id','price','mn_20','tmp']]
s_df.groupby('collection').tmp.median()
s_df.groupby('collection').tmp.mean()
s_df.sort_values('tmp').head()
s_df['tmp'] = s_df.price / s_df.mn_20
s_df[['collection','token_id','block_timestamp','price','mn_20','md_20','av_20','tmp']].to_csv('~/Downloads/tmp.csv', index=False)
s_df.groupby('collection').tmp.median()
s_df.groupby('collection').tmp.mean()
s_df.sort_values('tmp', ascending=0).head()
s_df.head(21)
m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
# m_df['feature_value'] = m_df.feature_value.apply(lambda x: x.strip() )
# m_df.feature_value.unique()
pred_cols = {}
metadata = {}
sales = {}
collection_features = {}
m_df[(m_df.collection == 'Galactic Punks') & (m_df.feature_name == 'pct')].sort_values('token_id')
c = 'Galactic Punks'
EXCLUDE_COLS = {
'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
}
for c in s_df.collection.unique():
print('Building {} model'.format(c))
sales[c] = s_df[ s_df.collection == c ]
exclude = EXCLUDE_COLS[c] if c in EXCLUDE_COLS.keys() else []
pred_cols[c] = sorted([x for x in m_df[ m_df.collection == c ].feature_name.unique() if not x in exclude])
collection_features[c] = [ c for c in pred_cols[c] if not c in ['score','rank','pct']+exclude ]
metadata[c] = m_df[ (m_df.collection == c) & (-(m_df.feature_name.isin(exclude))) ]
# tmp = pd.pivot_table( metadata[c], ['collection','token_id'], columns=['feature_name'], values=['feature_value'] )
metadata[c] = metadata[c].pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
metadata[c].columns = [ 'collection','token_id' ] + pred_cols[c]
features = collection_features[c]
cur = metadata[c]
cur = cur.dropna(subset=features)
for f in features:
if type(cur[f].values[0]) == str:
cur[f] = cur[f].apply(lambda x: re.sub("\"", "", str(x) ) )
cur[f] = cur[f].apply(lambda x: re.split("\(", x )[0].strip())
cur = cur.replace('', 'Default')
# if not 'pct' in cur.columns:
cur = calculate_percentages( cur, features )
dummies = pd.get_dummies(cur[features])
feature_cols = dummies.columns
cur = pd.concat([ cur.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
metadata[c] = cur
# pred_cols[c] = ['rank','score','timestamp','mn_20','log_mn_20'] + list(dummies.columns)
cols = [ 'collection_rank' ]
cols = [ ]
pred_cols[c] = [ 'rank','transform_rank','score'] + [x for x in cols if x in m_df.feature_name.unique()] + list(dummies.columns)
# collection_features = {
# 'Hashmasks': [ 'character','eyecolor','item','mask','skincolor' ]
# , 'Galactic Punks': [ 'backgrounds','hair','species','suits','jewelry','headware','glasses' ]
# , 'Solana Monkey Business': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
# , 'Aurory': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
# # , 'Thugbirdz': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
# }
excludedf = pd.DataFrame()
###########################
# Calculate Floor #
###########################
coefsdf = pd.DataFrame()
salesdf = pd.DataFrame()
attributes = pd.DataFrame()
pred_price = pd.DataFrame()
feature_values = pd.DataFrame()
collections = sorted(metadata.keys())
collection = 'Galactic Punks'
tokens = pd.read_csv('./data/tokens.csv')
collection = 'Levana Dragon Eggs'
# for collection in s_df.collection.unique():
for collection in ['Levana Dragon Eggs']:
# collection = 'LunaBulls'
# collection = 'smb'
# collection = 'aurory'
# collection = 'meerkatmillionaires'
# non-binary in model: collection_rank, temperature, weight
# non-binary in model; exclude from rarity: pct, rank, score
# exclude from model: lucky_number, shower
# exclude from model and rarity %: meteor_id, attribute_count, cracking_date
ALL_NUMERIC_COLS = ['rank','score','pct']
MODEL_EXCLUDE_COLS = {
# 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
'Levana Dragon Eggs': ['meteor_id','shower','lucky_number','cracking_date','attribute_count']
}
RARITY_EXCLUDE_COLS = {
# 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
'Levana Dragon Eggs': ['meteor_id','attribute_count','collection_rank','transformed_collection_rank','rarity_score']
}
NUMERIC_COLS = {
'Levana Dragon Eggs': ['collection_rank','weight','temperature','transformed_collection_rank','rarity_score']
}
ATT_EXCLUDE_COLS = {
'Levana Dragon Eggs': ['attribute_count','transformed_collection_rank']
}
# for collection in [ 'Levana Dragon Eggs' ]:
for collection in s_df.collection.unique():
print('Working on collection {}'.format(collection))
p_metadata = metadata[collection]
if 'attribute_count' in p_metadata.columns:
p_metadata['attribute_count'] = p_metadata.attribute_count.astype(float).astype(int)
sales = s_df[ s_df.collection == collection ]
metadata = m_df[ m_df.collection == collection ]
metadata[metadata.token_id == '1']
metadata[metadata.feature_name == 'rank']
metadata.feature_name.unique()
p_sales = sales[collection]
# specify the predictive features
p_pred_cols = pred_cols[collection]
p_features = collection_features[collection]
p_sales['token_id'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
p_metadata['token_id'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
for c in [ 'rank','score' ]:
p_metadata[c] = p_metadata[c].astype(float)
# p_sales['contract_address'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
# p_metadata['contract_address'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
p_sales['contract_address'] = ''
p_metadata['contract_address'] = ''
# categorize columns
all_names = sorted(metadata.feature_name.unique())
model_exclude = MODEL_EXCLUDE_COLS[collection] if collection in MODEL_EXCLUDE_COLS.keys() else []
num_features = sorted((NUMERIC_COLS[collection] if collection in NUMERIC_COLS.keys() else []) + ALL_NUMERIC_COLS)
num_features = [ x for x in num_features if x in metadata.feature_name.unique() ]
num_metadata = metadata[metadata.feature_name.isin(num_features)]
num_metadata[num_metadata.feature_name == 'rank']
cat_features = sorted([ x for x in all_names if not x in (model_exclude + num_features) ])
cat_metadata = metadata[metadata.feature_name.isin(cat_features)]
# remove one column from each group (since they are collinear)
# exclude = []
# for f in p_features:
# e = [ c for c in p_pred_cols if c[:len(f)] == f ][-1]
# exclude.append(e)
# create dummies for binary variables
num_metadata = num_metadata.pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
num_metadata.columns = [ 'collection','token_id' ] + num_features
df = p_sales.merge(p_metadata, on=['token_id','contract_address'])
df = df[df.mn_20.notnull()]
# create dummies for binary variables
cat_metadata = cat_metadata.pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
cat_metadata.columns = [ 'collection','token_id' ] + cat_features
cat_metadata = calculate_percentages( cat_metadata, cat_features )
dummies = pd.get_dummies(cat_metadata[cat_features])
cat_metadata = pd.concat([ cat_metadata.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
del cat_metadata['pct']
pred_cols = num_features + list(dummies.columns)
# create training df
df = merge(sales, num_metadata, ['collection','token_id'], ensure=False)
df = merge(df, cat_metadata, ['collection','token_id'])
for c in num_features:
df[c] = df[c].apply(lambda x: just_float(x))
# create target cols
target_col = 'adj_price'
df[target_col] = df.apply(lambda x: max(0.7 * (x['mn_20'] - 0.2), x['price']), 1 )
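# adj_price is the training target: sales below 70% of (mn_20 - 0.2) are raised to that level so
# fire-sale outliers don't drag the fit; everything else keeps its actual price.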
# df['mn_20'] = df.apply(lambda x: min(x[target_col], x['mn_20']), 1 )
# tmp = df[['block_timestamp','mn_20']].copy()
# tmp['tmp'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
# tmp = tmp.groupby('tmp').mn_20.median().reset_index()
# tmp.sort_values('tmp').to_csv('~/Downloads/tmp.csv', index=False)
# df['timestamp'] = df.block_timestamp.astype(int)
df = df[df[target_col].notnull()]
df = df.reset_index(drop=True)
df['transform_rank'] = df['rank'].apply(lambda x: 1.0 / (x**2) )
df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
df['rel_price_0'] = df[target_col] - df.mn_20
df['rel_price_1'] = df[target_col] / df.mn_20
df = df[df.mn_20 > 0]
df['log_mn_20'] = np.log(df.mn_20)
print('Training on {} sales'.format(len(df)))
# df['price_median'] = df.groupby('token_id').price.median()
df = standardize_df(df, pred_cols)
# standardize columns to mean 0 sd 1
len(p_pred_cols)
df = standardize_df(df, p_pred_cols)
std_pred_cols_0 = [ 'std_{}'.format(c) for c in p_pred_cols ]
# p_pred_cols = [ c for c in p_pred_cols if not c in exclude ]
std_pred_cols = [ 'std_{}'.format(c) for c in p_pred_cols ]
df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
# df.sort_values('block_timestamp').head(10)[['price','tx_id']]
# df.sort_values('block_timestamp').head(10)[['price','tx_id']].tx_id.values
# df = df[df.price >= 1]
std_pred_cols_0 = [ 'std_{}'.format(c) for c in pred_cols ]
std_pred_cols = [ 'std_{}'.format(c) for c in pred_cols ]
#########################
# Run the Model #
#########################
len(df)
len(df.dropna(subset=std_pred_cols))
tmp = df[std_pred_cols].count().reset_index()
tmp.columns = ['a','b']
tmp.sort_values('b').head(20)
rem = list(tmp[tmp.b==0].a.values)
std_pred_cols = [ c for c in std_pred_cols if not c in rem ]
# if collection == 'Levana Dragon Eggs':
# std_pred_cols = [ 'std_genus_Titan','std_score','std_weight','std_transformed_collection_rank','std_collection_rank','std_legendary_composition_None','std_ancient_composition_None' ]
mn = df.timestamp.min()
mx = df.timestamp.max()
df['weight'] = df.timestamp.apply(lambda x: 2.5 ** ((x - mn) / (mx - mn)) )
df['wt'] = df.timestamp.apply(lambda x: 2.5 ** ((x - mn) / (mx - mn)) )
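# recency weight: the oldest sale gets 2.5**0 = 1.0 and the newest gets 2.5**1 = 2.5, so recent sales
# count roughly 2.5x as much in the weighted regressions below.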
X = df[std_pred_cols].values
mu = df.log_price.mean()
sd = df.log_price.std()
df['std_log_price'] = (df.log_price - mu) / sd
# y = df.std_log_price.values
# y = df[target_col].values
# y = df.rel_price_1.values
y_0 = df.rel_price_0.values
y_1 = df.rel_price_1.values
# y_log = df.log_price.values
clf_lin = RidgeCV(alphas=[1.5**x for x in range(20)])
clf_lin.fit(X, y_0, df.weight.values)
# run the linear model
clf_lin = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
# clf_lin = RidgeCV(alphas=[1.5**x for x in range(20)])
clf_lin.fit(X, y_0, df.wt.values)
if collection == 'Levana Dragon Eggs':
coefs = []
for a, b in zip(std_pred_cols, clf_lin.coef_):
coefs += [[a,b]]
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
coefs.to_csv('~/Downloads/levana_lin_coefs.csv', index=False)
df['pred_lin'] = clf_lin.predict(X)
df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
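# the linear model predicts the premium over the floor (rel_price_0 = adj_price - mn_20); negative
# premiums are clipped to 0 before adding the floor back, so pred_lin is never below mn_20.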
df['err_lin'] = abs(((df.pred_lin - df[target_col]) / df[target_col]) )
# df['err_lin'] = abs(df.pred_lin - df.price )
# df[[ 'price','pred_lin','err_lin','mn_20' ]].sort_values('err_lin').tail(50)
df.head()
clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
clf_log.fit(X, y_1, df.weight.values)
# run the log model
clf_log = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
# clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
clf_log.fit(X, y_1, df.wt.values)
if collection == 'Levana Dragon Eggs':
coefs = []
for a, b in zip(std_pred_cols, clf_lin.coef_):
coefs += [[a,b]]
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
coefs.to_csv('~/Downloads/levana_log_coefs.csv', index=False)
df['pred_log'] = clf_log.predict(X)
df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
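# the log model predicts the price-to-floor ratio (rel_price_1 = adj_price / mn_20); the ratio is
# clipped at 1 before multiplying by the floor, so pred_log is also never below mn_20.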
df['err_log'] = abs(((df.pred_log - df[target_col]) / df[target_col]) )
df[[ target_col,'pred_log','err_log','mn_20' ]].sort_values('err_log').tail(50)
df['err'] = df.err_lin * df.err_log
df[[ target_col,'pred_log','err_log','err_lin','err','mn_20' ]].sort_values('err').tail(50)
df['collection'] = collection
excludedf = excludedf.append(df[df.err > 2][['collection','token_id','price']])
# df = df[df.err < 2]
print(round(len(df[df.err > 2]) * 100.0 / len(df), 2))
df[(df.err_log > 1) & (df.err_lin >= 5)]
clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
clf_log.fit(X, y_1, df.weight.values)
clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
clf_log.fit(X, y_1, df.weight.values)
df['pred_lin'] = clf_lin.predict(X)
df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
# df['pred_log'] = np.exp(clf_log.predict(X))
df['pred_log'] = clf_log.predict(X)
df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
# combine the models
clf = LinearRegression(fit_intercept=False)
clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.weight.values )
clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.wt.values )
print('Price = {} * lin + {} * log'.format( round(clf.coef_[0], 2), round(clf.coef_[1], 2) ))
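# the final prediction is a zero-intercept blend of the two models, weighted by sale recency:
# pred = lin_coef * pred_lin + log_coef * pred_log (the coefficients printed above).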
l = df.sort_values('block_timestamp', ascending=0).mn_20.values[0]
tmp = pd.DataFrame([[collection, clf.coef_[0], clf.coef_[1], l]], columns=['collection','lin_coef','log_coef','floor_price'])
@@ -335,34 +292,8 @@ for collection in ['Levana Dragon Eggs']:
df['pred'] = clf.predict( df[['pred_lin','pred_log']].values )
coefsdf = coefsdf.append(tmp)
df['err'] = (df.pred / df[target_col]).apply(lambda x: abs(x-1) )
df[df.block_timestamp>='2021-10-01'].sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'token_id','block_timestamp','err','mn_20' ]]
# df[df.block_timestamp>='2021-10-01'].err.mean()
df.merge(tokens[['collection','token_id','clean_token_id']]).sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'clean_token_id','rank','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).head(20)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).tail(40)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).head(20).tx_id.values
# print(np.mean(y))
# print(np.mean(clf.predict(X)))
# # run neural net
# model = tf.keras.models.Sequential([
# tf.keras.layers.Dense(9, activation='relu')
# , tf.keras.layers.Dropout(.2)
# , tf.keras.layers.Dense(3, activation='relu')
# , tf.keras.layers.Dropout(.2)
# , tf.keras.layers.Dense(1, activation='linear')
# ])
# model.compile(loss='mae', optimizer=tf.keras.optimizers.SGD(learning_rate=0.0025))
# model.fit(X, y, epochs=500, validation_split=0.3)
# df['pred'] = np.exp( (sd * model.predict(df[std_pred_cols].values)) + mu)
# df['pred'] = model.predict(df[std_pred_cols].values)
# ratio = df.price.mean() / df.pred.mean()
# print("Manually increasing predictions by {}%".format(round((ratio-1) * 100, 1)))
# checking errors
# df['pred'] = df.pred * ratio
# print out some summary stats
df['err'] = df[target_col] - df.pred
df['q'] = df.pred.rank() * 10 / len(df)
df['q'] = df.q.apply(lambda x: int(round(x)) )
@@ -373,137 +304,56 @@ for collection in ['Levana Dragon Eggs']:
df['pred_price'] = df.pred#.apply(lambda x: x*(1+pe_mu) )
df['pred_sd'] = df.pred * pe_sd
print(df.groupby('q')[['err','pred',target_col]].mean())
print(df[df.weight >= df.weight.median()].groupby('q')[['err','pred',target_col]].mean())
print(df[df.wt >= df.wt.median()].groupby('q')[['err','pred',target_col]].mean())
# df.err.mean()
# df[df.weight >= 3.5].err.mean()
df['collection'] = collection
print('Avg err last 100: {}'.format(round(df.sort_values('block_timestamp').head(100).err.mean(), 2)))
salesdf = salesdf.append( df[[ 'collection','contract_address','token_id','block_timestamp','price','pred','mn_20','rank','score' ]].sort_values('block_timestamp', ascending=0) )
salesdf = salesdf.append( df[[ 'collection','token_id','block_timestamp','price','pred','mn_20','rank' ]].sort_values('block_timestamp', ascending=0) )
# create the attributes dataframe
for f in p_features:
cur = p_metadata[[ 'token_id', f, '{}_pct'.format(f) ]]
cur.columns = [ 'token_id', 'value','rarity' ]
cur['feature'] = f
cur['collection'] = collection
attributes = attributes.append(cur)
# create predictions for each NFT in the collection
test = p_metadata.copy()
############################################################
# Create Predictions for Each NFT in The Collection #
############################################################
test = merge(num_metadata, cat_metadata, ['collection','token_id'])
for c in num_features:
test[c] = test[c].apply(lambda x: just_float(x) )
tail = df.sort_values('timestamp').tail(1)
for c in [ 'std_timestamp','mn_20','log_mn_20' ]:
if c in tail.columns:
test[c] = tail[c].values[0]
test = standardize_df(test, [c for c in p_pred_cols if not c in ['timestamp'] ], df)
# test['pred_lin'] = clf_lin.predict( test[std_pred_cols].values )
# test['pred_log'] = np.exp(clf_log.predict( test[std_pred_cols].values ))
test = standardize_df(test, pred_cols, df)
test['pred_lin'] = clf_lin.predict(test[std_pred_cols].values)
test['pred_lin'] = test.pred_lin.apply(lambda x: max(0, x) + l)
# test['pred_lin'] = df.pred_lin + df.mn_20
# df['pred_log'] = np.exp(clf_log.predict(X))
test['pred_log'] = clf_log.predict(test[std_pred_cols].values)
test['pred_log'] = test.pred_log.apply(lambda x: max(1, x)) * l
test['pred'] = clf.predict( test[[ 'pred_lin','pred_log' ]].values )
# test['pred'] = np.exp( (sd * model.predict(test[std_pred_cols].values)) + mu) * ratio
test['pred_price'] = test.pred#.apply(lambda x: x*(1+pe_mu) )
test['pred_price'] = clf.predict( test[[ 'pred_lin','pred_log' ]].values )
if not CHECK_EXCLUDE:
test['pred_price'] = test.pred.apply(lambda x: (x*0.985) )
test['pred_sd'] = test.pred * pe_sd
test['rk'] = test.pred.rank(ascending=0, method='first')
test['pred_price'] = test.pred_price.apply(lambda x: (x*0.985) )
test['pred_sd'] = test.pred_price * pe_sd
test['rk'] = test.pred_price.rank(ascending=0, method='first')
test['collection'] = collection
pred_price = pred_price.append( test[[ 'collection', 'contract_address','token_id','rank','rk','pred_price','pred_sd' ] + p_features].rename(columns={'rank':'hri_rank'}).sort_values('pred_price') )
# print(test[[ 'contract_address','token_id','pred_price','pred_sd' ]].sort_values('pred_price'))
pred_price = pred_price.append( test[[ 'collection','token_id','rank','rk','pred_price','pred_sd' ]].sort_values('pred_price') )
cols = metadata.feature_name.unique()
cols = [ x for x in cols if not x in (ATT_EXCLUDE_COLS[collection] if collection in ATT_EXCLUDE_COLS.keys() else []) + ALL_NUMERIC_COLS ]
exclude = RARITY_EXCLUDE_COLS[collection] if collection in RARITY_EXCLUDE_COLS.keys() else []
for c in cols:
cur = metadata[metadata.feature_name == c][['collection','token_id','feature_name','feature_value']]
if c in exclude:
cur['rarity'] = None
else:
g = cur.groupby('feature_value').token_id.count().reset_index()
g['rarity'] = g.token_id / len(cur.token_id.unique())
cur = merge(cur, g[['feature_value','rarity']])
attributes = attributes.append(cur)
##############################
# Feature Importance #
##############################
coefs = []
for a, b, c in zip(p_pred_cols, clf_lin.coef_, clf_log.coef_):
coefs += [[ collection, a, b, c ]]
coefs = pd.DataFrame(coefs, columns=['collection','col','lin_coef','log_coef'])
# coefs['feature'] = coefs.col.apply(lambda x: ' '.join(re.split('_', x)[:-1]).title() )
# coefs['feature'] = coefs.col.apply(lambda x: '_'.join(re.split('_', x)[:-1]) )
# coefs['value'] = coefs.col.apply(lambda x: re.split('_', x)[-1] )
# mn = coefs.groupby('feature')[[ 'lin_coef','log_coef' ]].min().reset_index()
# mn.columns = [ 'feature','mn_lin_coef','mn_log_coef' ]
# coefs = coefs.merge(mn)
# coefs['lin_coef'] = coefs.lin_coef - coefs.mn_lin_coef
# coefs['log_coef'] = coefs.log_coef - coefs.mn_log_coef
# coefs
# g = attributes[ attributes.collection == collection ][[ 'feature','value','rarity' ]].drop_duplicates()
# g['value'] = g.value.astype(str)
# len(coefs)
# g = coefs.merge(g, how='left')
# g[g.rarity.isnull()]
# len(g)
# coefs = coefs.merge( m_df[ m_df.collection == collection ][[ 'feature_name','' ]] )
# coefs.sort_values('lin_coef').tail(20)
# TODO: pick the most common one and have that be the baseline
most_common = attributes[(attributes.collection == collection)].sort_values('rarity', ascending=0).groupby('feature').head(1)
most_common['col'] = most_common.apply(lambda x: 'std_{}_{}'.format( re.sub(' ', '_', x['feature'].lower()), x['value'] ), 1 )
mc = most_common.col.unique()
data = []
for c0 in std_pred_cols_0:
if c0 in ['std_rank','std_score','std_pct','std_timestamp','std_mn_20','std_log_mn_20']:
continue
f = '_'.join(re.split('_', c0)[1:-1])
v = re.split('_', c0)[-1]
rarity = p_metadata[p_metadata['{}_{}'.format(f, v)]==1]['{}_pct'.format(f)].values[0]
# avg = p_metadata['{}_pct'.format(f)].mean()
# avg_pct = df.pct.mean()
# pct_std = ((avg_pct * r / avg) - avg_pct) / df.pct.std()
r = df[df['{}_{}'.format(f, v)]==1].std_rank.mean()
s = df[df['{}_{}'.format(f, v)]==1].std_score.mean()
if r == r and s == s:
datum = [ c0, rarity ]
for c1 in std_pred_cols:
datum.append(1 if c1 == c0 else r if c1 == 'std_rank' else s if c1 == 'std_score' else 1 if c1 in mc else 0 )
data += [ datum ]
importance = pd.DataFrame(data, columns=['feature','rarity']+std_pred_cols)
sorted(importance.feature.unique())
importance[importance.feature == 'std_fur_/_skin_Leopard']
if 'std_timestamp' in df.columns:
importance['std_timestamp'] = df.std_timestamp.max()
# importance['pred_lin'] = clf_lin.predict( importance[std_pred_cols].values )
# importance['pred_log'] = np.exp(clf_log.predict( importance[std_pred_cols].values ))
importance['pred_lin'] = clf_lin.predict(importance[std_pred_cols].values)
importance['pred_lin'] = importance.pred_lin.apply(lambda x: max(0, x) + l)
# importance['pred_lin'] = importance.pred_lin.apply(lambda x: x + l)
importance['pred_log'] = clf_log.predict(importance[std_pred_cols].values)
importance['pred_log'] = importance.pred_log.apply(lambda x: max(1, x)) * l
# importance['pred_log'] = importance.pred_log.apply(lambda x: x) * l
importance['pred'] = clf.predict( importance[[ 'pred_lin','pred_log' ]].values )
# importance['pred'] = np.exp( (sd * model.predict(importance[std_pred_cols].values)) + mu)
importance = importance.sort_values('pred', ascending=0)
importance.head()[['feature','pred']]
importance[importance.feature == 'std_fur_/_skin_Leopard']
importance['feature'] = importance.feature.apply(lambda x: re.sub('std_', '', x))
importance['value'] = importance.feature.apply(lambda x: re.split('_', x)[-1])
importance['feature'] = importance.feature.apply(lambda x: '_'.join(re.split('_', x)[:-1]))
mn = importance.groupby('feature').pred.min().reset_index().rename(columns={'pred':'baseline'})
importance = importance.merge(mn)
importance['pred_vs_baseline'] = importance.pred - importance.baseline
importance['pct_vs_baseline'] = (importance.pred / importance.baseline) - 1
importance[(importance.feature == 'fur_/_skin')].sort_values('pred')[['value','rarity','pred','pred_lin','pred_log','std_rank','std_score']].sort_values('rarity')
importance['collection'] = collection
importance.sort_values('pct_vs_baseline')[['feature','value','pct_vs_baseline']]
tmp = importance[std_pred_cols].mean().reset_index()
tmp.columns = [ 'a', 'b' ]
tmp = tmp.sort_values('b')
feature_values = feature_values.append(importance[['collection','feature','value','pred','pred_vs_baseline','pct_vs_baseline','rarity']])
attributes['feature'] = attributes.feature.apply(lambda x: re.sub('_', ' ', x).title() )
feature_values['feature'] = feature_values.feature.apply(lambda x: re.sub('_', ' ', x).title() )
pred_price = pred_price[[ 'collection', 'contract_address', 'token_id', 'hri_rank', 'rk', 'pred_price', 'pred_sd' ]]
attributes['feature_name'] = attributes.feature_name.apply(lambda x: re.sub('_', ' ', x).title() )
sorted(attributes['feature_name'].unique())
if len(feature_values):
feature_values['feature_name'] = feature_values.feature_name.apply(lambda x: re.sub('_', ' ', x).title() )
coefsdf.to_csv('./data/coefsdf.csv', index=False)
salesdf.to_csv('./data/model_sales.csv', index=False)
@@ -511,24 +361,6 @@ pred_price.to_csv('./data/pred_price.csv', index=False)
attributes.to_csv('./data/attributes.csv', index=False)
feature_values.to_csv('./data/feature_values.csv', index=False)
pred_price = pd.read_csv('./data/pred_price.csv')
tokens = pd.read_csv('./data/tokens.csv')
rem = tokens[tokens.clean_token_id>=10000].token_id.unique()
l0 = len(pred_price)
pred_price = pred_price[ -((pred_price.collection == 'LunaBulls') & (pred_price.token_id.isin(rem))) ]
l1 = len(pred_price)
pred_price.to_csv('./data/pred_price.csv', index=False)
# excludedf.to_csv('./data/excludedf.csv', index=False)
# listings = pd.read_csv('./data/listings.csv')
# listings['token_id'] = listings.token_id.astype(int)
# tmp = salesdf.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
# tmp = pred_price.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
# tmp['token_id'] = tmp.token_id.astype(int)
# tmp = tmp.merge(listings[['collection','token_id','price']])
# tmp.sort_values('pred_price', ascending=0)
if CHECK_EXCLUDE:
salesdf['rat'] = salesdf.price / salesdf.pred
salesdf['dff'] = salesdf.price - salesdf.pred
@@ -542,9 +374,3 @@ if CHECK_EXCLUDE:
print(salesdf.exclude.mean())
salesdf[salesdf.token_id == '2239'][['collection','price','exclude']]
salesdf[salesdf.exclude == 1][[ 'collection','token_id','price','exclude' ]].to_csv('./data/exclude.csv', index=False)
attributes[ (attributes.collection == 'thugbirdz') & (attributes.token_id == '1869') ]
feature_values[ (feature_values.collection == 'thugbirdz') & (feature_values.feature == 'position_in_gang') ]
sorted(feature_values[ (feature_values.collection == 'thugbirdz') ].feature.unique())
pred_price[pred_price.collection == 'peskypenguinclub'].head()

solana_model_old.py (new file, 574 lines)
View File

@@ -0,0 +1,574 @@
import os
import re
import json
import warnings
import requests
import numpy as np
import pandas as pd
import urllib.request
import tensorflow as tf
import snowflake.connector
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
warnings.filterwarnings('ignore')
os.chdir('/Users/kellenblumberg/git/nft-deal-score')
CHECK_EXCLUDE = False
CHECK_EXCLUDE = True
# Using sales from howrare.is: the last sale under 300 happened when the floor was at 72; filtering to periods when the floor is >100, the lowest sale was 400
###################################
# Define Helper Functions #
###################################
def standardize_df(df, cols, usedf=None, verbose=False):
for c in cols:
if type(usedf) != type(pd.DataFrame()):
usedf = df
mu = usedf[c].mean()
sd = usedf[c].std()
if verbose:
print(c)
if len(df[c].unique()) == 2 and df[c].max() == 1 and df[c].min() == 0:
df['std_{}'.format(c)] = df[c].apply(lambda x: (x*2) - 1 )
else:
df['std_{}'.format(c)] = (df[c] - mu) / sd
return(df)
def just_float(x):
x = re.sub('[^\d\.]', '', str(x))
return(float(x))
def calculate_percentages(df, cols=[]):
add_pct = not 'pct' in df.columns
if not len(cols):
cols = df.columns
if add_pct:
df['pct'] = 1
for c in cols:
g = df[c].value_counts().reset_index()
g.columns = [ c, 'N' ]
col = '{}_pct'.format(c)
g[col] = g.N / g.N.sum()
df = df.merge( g[[ c, col ]] )
if add_pct:
df['pct'] = df.pct * df[col]
return(df)
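# each column's N / sum(N) is the trait frequency; 'pct' multiplies these frequencies across columns,
# giving a rough overall rarity estimate per token (assuming traits are independent).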
exclude = [
# (collection, token_id, price)
( 'aurory', 2239, 3500 )
# ( 'aurory', 856, 150 )
# ( 'aurory', 4715, 500 )
# ( 'aurory', 5561, 298 )
# ( 'aurory', 5900, 199 )
# ( 'aurory', 3323, 138 )
]
s_df = pd.read_csv('./data/sales.csv').rename(columns={'sale_date':'block_timestamp'})
s_df[ s_df.collection == 'Levana Dragons' ].sort_values('block_timestamp', ascending=0).head()
print(len(s_df[s_df.collection == 'Levana Dragon Eggs']))
print(s_df.groupby('collection').token_id.count())
s_df.collection.unique()
s_df = s_df[-s_df.collection.isin(['Levana Meteors','Levana Dust'])]
s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
s_df = s_df[ -s_df.collection.isin(['boryokudragonz', 'Boryoku Dragonz']) ]
for e in exclude:
s_df = s_df[-( (s_df.collection == e[0]) & (s_df.token_id == e[1]) & (s_df.price == e[2]) )]
s_df = s_df[ -((s_df.collection == 'smb') & (s_df.price < 1)) ]
# exclude weird data points
if not CHECK_EXCLUDE:
exclude = pd.read_csv('./data/exclude.csv')
s_df = s_df.merge(exclude, how='left')
s_df = s_df[s_df.exclude.isnull()]
del s_df['exclude']
m_df = pd.read_csv('./data/metadata.csv')
m_df['token_id'] = m_df.token_id.astype(str)
tmp = m_df[m_df.collection.isin(['Levana Dragon Eggs','Levana Meteors','Levana Dust'])]
tmp['tmp'] = tmp.token_id.astype(int)
tmp.groupby('collection').tmp.max()
m_df.head()
# s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(x[:10], '%Y-%m-%d %H:%M:%S') )
s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(str(x)[:19], '%Y-%m-%d %H:%M:%S') if len(x) > 10 else datetime.strptime(x[:10], '%Y-%m-%d') )
s_df['timestamp'] = s_df.block_timestamp.astype(int)
# del metadata['price']
# del metadata['last_sale']
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['days_ago'] = s_df.block_timestamp.apply(lambda x: (datetime.today() - x).days ).astype(int)
s_df[[ 'block_timestamp','days_ago' ]].drop_duplicates(subset=['days_ago'])
s_df['av_20'] = s_df.groupby('collection')['mn_20'].rolling(20).mean().reset_index(0,drop=True)
s_df = s_df.sort_values(['collection','block_timestamp'])
# s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).median().reset_index(0,drop=True)
s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)
# s_df[ (-((s_df.price) >= (s_df.md_20 * 0.2))) & (s_df.price.notnull()) & (s_df.collection == 'Levana Dragon Eggs') ]
s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.75) ]
s_df = s_df.sort_values(['collection','block_timestamp'])
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
s_df = s_df.sort_values(['collection','block_timestamp'])
# s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).min().reset_index(0,drop=True)
s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.1).reset_index(0,drop=True)
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(21).tail(40)
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(20).sort_values('price')
s_df['tmp'] = s_df.mn_20 / s_df.md_20
tmp = s_df[s_df.collection=='smb'][['mn_20','block_timestamp']]
tmp['date'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
tmp = tmp.groupby('date').mn_20.median().reset_index()
tmp.to_csv('~/Downloads/tmp.csv', index=False)
s_df['tmp'] = s_df.price / s_df.mn_20
s_df[s_df.collection == 'smb'].sort_values('block_timestamp')[['token_id','price','mn_20']]
s_df[s_df.collection == 'smb'].sort_values('tmp').head(20)[['collection','token_id','price','mn_20','tmp']]
s_df.groupby('collection').tmp.median()
s_df.groupby('collection').tmp.mean()
s_df.sort_values('tmp').head()
s_df['tmp'] = s_df.price / s_df.mn_20
s_df[['collection','token_id','block_timestamp','price','mn_20','md_20','av_20','tmp']].to_csv('~/Downloads/tmp.csv', index=False)
s_df.groupby('collection').tmp.median()
s_df.groupby('collection').tmp.mean()
s_df.sort_values('tmp', ascending=0).head()
s_df.head(21)
m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
# m_df['feature_value'] = m_df.feature_value.apply(lambda x: x.strip() )
# m_df.feature_value.unique()
pred_cols = {}
metadata = {}
sales = {}
collection_features = {}
m_df[(m_df.collection == 'Galactic Punks') & (m_df.feature_name == 'pct')].sort_values('token_id')
c = 'Levana Dragon Eggs'
# pred_cols[c]
EXCLUDE_COLS = {
# 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
'Levana Dragon Eggs': ['meteor_id','shower','lucky_number','cracking_date','attribute_count']
}
NUMERIC_COLS = {
'Levana Dragon Eggs': ['rank','score','pct','collection_rank','weight','temperature']
}
for c in s_df.collection.unique():
print('Building {} model'.format(c))
exclude = EXCLUDE_COLS[c] if c in EXCLUDE_COLS.keys() else []
n_cols = NUMERIC_COLS[c] if c in NUMERIC_COLS.keys() else []
exclude = [ x for x in exclude if not x in n_cols ]
o_cols = sorted([x for x in m_df[ m_df.collection == c ].feature_name.unique() if (not x in exclude) and not (x in n_cols) ])
sales[c] = s_df[ s_df.collection == c ]
pred_cols[c] = sorted( n_cols + o_cols )
collection_features[c] = [ c for c in pred_cols[c] if not c in ['score','rank','pct']+exclude ]
metadata[c] = m_df[ (m_df.collection == c) & (-(m_df.feature_name.isin(exclude))) ]
# tmp = pd.pivot_table( metadata[c], ['collection','token_id'], columns=['feature_name'], values=['feature_value'] )
metadata[c] = metadata[c].pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
metadata[c].columns = [ 'collection','token_id' ] + pred_cols[c]
features = collection_features[c]
cur = metadata[c]
# cur = cur.dropna(subset=features)
for f in features:
if type(cur[f].values[0]) == str:
cur[f] = cur[f].apply(lambda x: re.sub("\"", "", str(x) ) )
cur[f] = cur[f].apply(lambda x: re.split("\(", x )[0].strip())
cur = cur.replace('', 'Default')
# if not 'pct' in cur.columns:
cur = calculate_percentages( cur, o_cols )
dummies = pd.get_dummies(cur[o_cols])
# feature_cols = dummies.columns
cur = pd.concat([ cur.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
metadata[c] = cur
# pred_cols[c] = ['rank','score','timestamp','mn_20','log_mn_20'] + list(dummies.columns)
# cols = [ 'collection_rank' ]
# cols = [ ]
# pred_cols[c] = [ 'rank','transform_rank','score'] + n_cols + [x for x in cols if x in m_df.feature_name.unique()] + list(dummies.columns)
# pred_cols[c] = [ 'rank','transform_rank','score'] + n_cols + list(dummies.columns)
pred_cols[c] = n_cols + list(dummies.columns)
# collection_features = {
# 'Hashmasks': [ 'character','eyecolor','item','mask','skincolor' ]
# , 'Galactic Punks': [ 'backgrounds','hair','species','suits','jewelry','headware','glasses' ]
# , 'Solana Monkey Business': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
# , 'Aurory': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
# # , 'Thugbirdz': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
# }
coefsdf = pd.DataFrame()
salesdf = pd.DataFrame()
attributes = pd.DataFrame()
pred_price = pd.DataFrame()
feature_values = pd.DataFrame()
collections = sorted(metadata.keys())
collection = 'Galactic Punks'
tokens = pd.read_csv('./data/tokens.csv')
collection = 'Levana Dragon Eggs'
# for collection in s_df.collection.unique():
for collection in ['Levana Dragon Eggs']:
# collection = 'LunaBulls'
# collection = 'smb'
# collection = 'aurory'
# collection = 'meerkatmillionaires'
print('Working on collection {}'.format(collection))
p_metadata = metadata[collection]
if 'attribute_count' in p_metadata.columns:
p_metadata['attribute_count'] = p_metadata.attribute_count.astype(float).astype(int)
p_sales = sales[collection]
# specify the predictive features
p_pred_cols = pred_cols[collection]
if collection == 'Levana Dragon Eggs':
p_pred_cols += [ 'transformed_collection_rank' ]
p_features = collection_features[collection]
p_sales['token_id'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
p_metadata['token_id'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
for c in [ 'rank','score' ]:
p_metadata[c] = p_metadata[c].astype(float)
# p_sales['contract_address'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
# p_metadata['contract_address'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
p_sales['contract_address'] = ''
p_metadata['contract_address'] = ''
# remove 1 column from each group (since they are collinear)
# exclude = []
# for f in p_features:
# e = [ c for c in p_pred_cols if c[:len(f)] == f ][-1]
# exclude.append(e)
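# Note: the one-hot columns built from o_cols above are collinear within each feature
# group; the regularized models used below (Lasso / RidgeCV) tolerate that, which may be
# why this drop-one-level-per-group step is left commented out. A hedged alternative,
# if dropping a level were needed, would be pd.get_dummies(cur[o_cols], drop_first=True).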
df = p_sales.merge(p_metadata, on=['token_id','contract_address'])
df = df[df.mn_20.notnull()]
target_col = 'adj_price'
df[target_col] = df.apply(lambda x: max(0.7 * (x['mn_20'] - 0.2), x['price']), 1 )
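# adj_price clamps sales far below the 20-sale floor: the target is never allowed
# below 0.7 * (mn_20 - 0.2), which damps extreme under-floor outliers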
# df['mn_20'] = df.apply(lambda x: min(x[target_col], x['mn_20']), 1 )
# tmp = df[['block_timestamp','mn_20']].copy()
# tmp['tmp'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
# tmp = tmp.groupby('tmp').mn_20.median().reset_index()
# tmp.sort_values('tmp').to_csv('~/Downloads/tmp.csv', index=False)
# df['timestamp'] = df.block_timestamp.astype(int)
df = df[df[target_col].notnull()]
df = df.reset_index(drop=True)
df['transform_rank'] = df['rank'].apply(lambda x: 1.0 / (x**2) )
df['rel_price_0'] = df[target_col] - df.mn_20
df['rel_price_1'] = df[target_col] / df.mn_20
df = df[df.mn_20 > 0]
df['log_mn_20'] = np.log(df.mn_20)
print('Training on {} sales'.format(len(df)))
# df['price_median'] = df.groupby('token_id').price.median()
# standardize columns to mean 0 sd 1
len(p_pred_cols)
n_cols = NUMERIC_COLS[collection] if collection in NUMERIC_COLS.keys() else []
for c in n_cols:
df[c] = df[c].apply(lambda x: just_float(x) )
if collection == 'Levana Dragon Eggs':
df['transformed_collection_rank'] = df.collection_rank.apply(lambda x: (1.0/ x)**2 )
df = standardize_df(df, p_pred_cols)
std_pred_cols_0 = [ 'std_{}'.format(c) for c in p_pred_cols ]
# p_pred_cols = [ c for c in p_pred_cols if not c in exclude ]
std_pred_cols = [ 'std_{}'.format(c) for c in p_pred_cols ]
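# Note (assumption): standardize_df is defined earlier in this script; a minimal
# equivalent would z-score each predictor into a 'std_'-prefixed column, e.g.:
#   for c in p_pred_cols:
#       df['std_{}'.format(c)] = (df[c].astype(float) - df[c].astype(float).mean()) / df[c].astype(float).std()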
df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
# df.sort_values('block_timestamp').head(10)[['price','tx_id']]
# df.sort_values('block_timestamp').head(10)[['price','tx_id']].tx_id.values
# df = df[df.price >= 1]
#########################
# Run the Model #
#########################
len(df)
len(df.dropna(subset=std_pred_cols))
tmp = df[std_pred_cols].count().reset_index()
tmp.columns = ['a','b']
tmp.sort_values('b').head(20)
rem = list(tmp[tmp.b==0].a.values)
std_pred_cols = [ c for c in std_pred_cols if not c in rem ]
mn = df.timestamp.min()
mx = df.timestamp.max()
df['weight'] = df.timestamp.apply(lambda x: 2.5 ** ((x - mn) / (mx - mn)) )
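# recency weight: ranges from 1.0 for the oldest sale up to 2.5 for the most recent sale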
X = df[std_pred_cols].values
mu = df.log_price.mean()
sd = df.log_price.std()
df['std_log_price'] = (df.log_price - mu) / sd
# y = df.std_log_price.values
# y = df[target_col].values
# y = df.rel_price_1.values
y_0 = df.rel_price_0.values
y_1 = df.rel_price_1.values
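# two targets: y_0 is the absolute premium of adj_price over the floor (price - mn_20),
# y_1 is the price / floor ratio; each gets its own linear model below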
# y_log = df.log_price.values
clf_lin = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
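# Lasso (default alpha=1.0) is used for Levana Dragon Eggs; other collections use
# RidgeCV, which cross-validates alpha over 1.5**0 .. 1.5**19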
clf_lin.fit(X, y_0, df.weight.values)
coefs = []
for a, b in zip(std_pred_cols, clf_lin.coef_):
coefs += [[a,b]]
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
coefs.to_csv('~/Downloads/tmp.csv', index=False)
df['pred_lin'] = clf_lin.predict(X)
df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
df['err_lin'] = abs(((df.pred_lin - df[target_col]) / df[target_col]) )
# df['err_lin'] = abs(df.pred_lin - df.price )
# df[[ 'price','pred_lin','err_lin','mn_20' ]].sort_values('err_lin').tail(50)
df.head()
clf_log = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
clf_log.fit(X, y_1, df.weight.values)
coefs = []
for a, b in zip(std_pred_cols, clf_log.coef_):
coefs += [[a,b]]
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
coefs.to_csv('~/Downloads/tmp.csv', index=False)
df['pred_log'] = clf_log.predict(X)
df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
df['err_log'] = abs(((df.pred_log - df[target_col]) / df[target_col]) )
df[[ target_col,'pred_log','err_log','mn_20' ]].sort_values('err_log').tail(50)
df['err'] = df.err_lin * df.err_log
df[[ target_col,'pred_log','err_log','err_lin','err','mn_20' ]].sort_values('err').tail(50)
df['collection'] = collection
# df['pred_lin'] = clf_lin.predict(X)
# df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
# df['pred_log'] = np.exp(clf_log.predict(X))
# df['pred_log'] = clf_log.predict(X)
# df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
clf = LinearRegression(fit_intercept=False)
clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.weight.values )
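# blend the floor-premium (lin) and floor-multiple (log) predictions with a
# no-intercept regression; if either learned weight is negative, fall back to the
# other prediction alone (handled below)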
print('Price = {} * lin + {} * log'.format( round(clf.coef_[0], 2), round(clf.coef_[1], 2) ))
l = df.sort_values('block_timestamp', ascending=0).mn_20.values[0]
tmp = pd.DataFrame([[collection, clf.coef_[0], clf.coef_[1], l]], columns=['collection','lin_coef','log_coef','floor_price'])
if clf.coef_[0] < 0:
print('Only using log')
df['pred'] = df.pred_log
tmp['lin_coef'] = 0
tmp['log_coef'] = 1
elif clf.coef_[1] < 0:
print('Only using lin')
df['pred'] = df.pred_lin
tmp['lin_coef'] = 1
tmp['log_coef'] = 0
else:
print('Only using BOTH!')
df['pred'] = clf.predict( df[['pred_lin','pred_log']].values )
coefsdf = coefsdf.append(tmp)
df['err'] = (df.pred / df[target_col]).apply(lambda x: abs(x-1) )
df[df.block_timestamp>='2021-10-01'].sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'token_id','block_timestamp','err','mn_20' ]]
# df[df.block_timestamp>='2021-10-01'].err.mean()
df.merge(tokens[['collection','token_id','clean_token_id']]).sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'clean_token_id','rank','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).head(20)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).tail(40)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).head(20).tx_id.values
# print(np.mean(y))
# print(np.mean(clf.predict(X)))
# # run neural net
# model = tf.keras.models.Sequential([
# tf.keras.layers.Dense(9, activation='relu')
# , tf.keras.layers.Dropout(.2)
# , tf.keras.layers.Dense(3, activation='relu')
# , tf.keras.layers.Dropout(.2)
# , tf.keras.layers.Dense(1, activation='linear')
# ])
# model.compile(loss='mae', optimizer=tf.keras.optimizers.SGD(learning_rate=0.0025))
# model.fit(X, y, epochs=500, validation_split=0.3)
# df['pred'] = np.exp( (sd * model.predict(df[std_pred_cols].values)) + mu)
# df['pred'] = model.predict(df[std_pred_cols].values)
# ratio = df.price.mean() / df.pred.mean()
# print("Manually increasing predictions by {}%".format(round((ratio-1) * 100, 1)))
# checking errors
# df['pred'] = df.pred * ratio
df['err'] = df[target_col] - df.pred
df['q'] = df.pred.rank() * 10 / len(df)
df['q'] = df.q.apply(lambda x: int(round(x)) )
df['pct_err'] = (df[target_col] / df.pred) - 1
pe_mu = df.pct_err.mean()
pe_sd = df[ (df.pct_err > -.9) & (df.pct_err < 0.9) ].pct_err.std()
pe_sd = df[ (df.pct_err > -.9) & (df.pct_err < 0.9) & (df.days_ago<=50) ].pct_err.std()
df['pred_price'] = df.pred#.apply(lambda x: x*(1+pe_mu) )
df['pred_sd'] = df.pred * pe_sd
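# pred_sd = prediction * std of the trimmed percentage error on recent (days_ago <= 50)
# sales; it feeds the deal-score pnorm in the Shiny app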
print(df.groupby('q')[['err','pred',target_col]].mean())
print(df[df.weight >= df.weight.median()].groupby('q')[['err','pred',target_col]].mean())
# df.err.mean()
# df[df.weight >= 3.5].err.mean()
df['collection'] = collection
print('Avg err last 100: {}'.format(round(df.sort_values('block_timestamp', ascending=0).head(100).err.mean(), 2)))
salesdf = salesdf.append( df[[ 'collection','contract_address','token_id','block_timestamp','price','pred','mn_20','rank','score' ]].sort_values('block_timestamp', ascending=0) )
# create the attributes dataframe
for f in p_features:
if f and '{}_pct'.format(f) in p_metadata.columns:
cur = p_metadata[[ 'token_id', f, '{}_pct'.format(f) ]]
cur.columns = [ 'token_id', 'value','rarity' ]
cur['feature'] = f
cur['collection'] = collection
attributes = attributes.append(cur)
# create predictions for each NFT in the collection
test = p_metadata.copy()
for c in n_cols:
test[c] = test[c].apply(lambda x: just_float(x) )
if collection in [ 'Levana Dragon Eggs' ]:
test['transformed_collection_rank'] = test.collection_rank.apply(lambda x: (1.0 / x) ** 2 )
tail = df.sort_values('timestamp').tail(1)
for c in [ 'std_timestamp','mn_20','log_mn_20' ]:
if c in tail.columns:
test[c] = tail[c].values[0]
test = standardize_df(test, [c for c in p_pred_cols if not c in ['timestamp'] ], df, True)
# test['pred_lin'] = clf_lin.predict( test[std_pred_cols].values )
# test['pred_log'] = np.exp(clf_log.predict( test[std_pred_cols].values ))
test['pred_lin'] = clf_lin.predict(test[std_pred_cols].values)
test['pred_lin'] = test.pred_lin.apply(lambda x: max(0, x) + l)
# test['pred_lin'] = df.pred_lin + df.mn_20
# df['pred_log'] = np.exp(clf_log.predict(X))
test['pred_log'] = clf_log.predict(test[std_pred_cols].values)
test['pred_log'] = test.pred_log.apply(lambda x: max(1, x)) * l
test['pred'] = clf.predict( test[[ 'pred_lin','pred_log' ]].values )
# test['pred'] = np.exp( (sd * model.predict(test[std_pred_cols].values)) + mu) * ratio
test['pred_price'] = test.pred#.apply(lambda x: x*(1+pe_mu) )
if not CHECK_EXCLUDE:
test['pred_price'] = test.pred.apply(lambda x: (x*0.985) )
test['pred_sd'] = test.pred * pe_sd
test['rk'] = test.pred.rank(ascending=0, method='first')
test['collection'] = collection
pred_price = pred_price.append( test[[ 'collection', 'contract_address','token_id','rank','rk','pred_price','pred_sd' ] + p_features].rename(columns={'rank':'hri_rank'}).sort_values('pred_price') )
# print(test[[ 'contract_address','token_id','pred_price','pred_sd' ]].sort_values('pred_price'))
##############################
# Feature Importance #
##############################
coefs = []
for a, b, c in zip(p_pred_cols, clf_lin.coef_, clf_log.coef_):
coefs += [[ collection, a, b, c ]]
coefs = pd.DataFrame(coefs, columns=['collection','col','lin_coef','log_coef'])
# coefs['feature'] = coefs.col.apply(lambda x: ' '.join(re.split('_', x)[:-1]).title() )
# coefs['feature'] = coefs.col.apply(lambda x: '_'.join(re.split('_', x)[:-1]) )
# coefs['value'] = coefs.col.apply(lambda x: re.split('_', x)[-1] )
# mn = coefs.groupby('feature')[[ 'lin_coef','log_coef' ]].min().reset_index()
# mn.columns = [ 'feature','mn_lin_coef','mn_log_coef' ]
# coefs = coefs.merge(mn)
# coefs['lin_coef'] = coefs.lin_coef - coefs.mn_lin_coef
# coefs['log_coef'] = coefs.log_coef - coefs.mn_log_coef
# coefs
# g = attributes[ attributes.collection == collection ][[ 'feature','value','rarity' ]].drop_duplicates()
# g['value'] = g.value.astype(str)
# len(coefs)
# g = coefs.merge(g, how='left')
# g[g.rarity.isnull()]
# len(g)
# coefs = coefs.merge( m_df[ m_df.collection == collection ][[ 'feature_name','' ]] )
# coefs.sort_values('lin_coef').tail(20)
# TODO: pick the most common one and have that be the baseline
most_common = attributes[(attributes.collection == collection)].sort_values('rarity', ascending=0).groupby('feature').head(1)
most_common['col'] = most_common.apply(lambda x: 'std_{}_{}'.format( re.sub(' ', '_', x['feature'].lower()), x['value'] ), 1 )
mc = most_common.col.unique()
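# build one synthetic row per feature value: the value's own dummy is set to 1, the
# remaining features sit at their most-common (baseline) value, and std_rank / std_score
# take the mean among sold tokens with that value; values with no sales (NaN means)
# are skipped via the r == r check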
data = []
for c0 in std_pred_cols_0:
if c0 in ['std_rank','std_score','std_pct','std_timestamp','std_mn_20','std_log_mn_20']:
continue
f = '_'.join(re.split('_', c0)[1:-1])
v = re.split('_', c0)[-1]
rarity = p_metadata[p_metadata['{}_{}'.format(f, v)]==1]['{}_pct'.format(f)].values[0]
# avg = p_metadata['{}_pct'.format(f)].mean()
# avg_pct = df.pct.mean()
# pct_std = ((avg_pct * r / avg) - avg_pct) / df.pct.std()
r = df[df['{}_{}'.format(f, v)]==1].std_rank.mean()
s = df[df['{}_{}'.format(f, v)]==1].std_score.mean()
if r == r and s == s:
datum = [ c0, rarity ]
for c1 in std_pred_cols:
datum.append(1 if c1 == c0 else r if c1 == 'std_rank' else s if c1 == 'std_score' else 1 if c1 in mc else 0 )
data += [ datum ]
importance = pd.DataFrame(data, columns=['feature','rarity']+std_pred_cols)
sorted(importance.feature.unique())
importance[importance.feature == 'std_fur_/_skin_Leopard']
if 'std_timestamp' in df.columns:
importance['std_timestamp'] = df.std_timestamp.max()
# importance['pred_lin'] = clf_lin.predict( importance[std_pred_cols].values )
# importance['pred_log'] = np.exp(clf_log.predict( importance[std_pred_cols].values ))
importance['pred_lin'] = clf_lin.predict(importance[std_pred_cols].values)
importance['pred_lin'] = importance.pred_lin.apply(lambda x: max(0, x) + l)
# importance['pred_lin'] = importance.pred_lin.apply(lambda x: x + l)
importance['pred_log'] = clf_log.predict(importance[std_pred_cols].values)
importance['pred_log'] = importance.pred_log.apply(lambda x: max(1, x)) * l
# importance['pred_log'] = importance.pred_log.apply(lambda x: x) * l
importance['pred'] = clf.predict( importance[[ 'pred_lin','pred_log' ]].values )
# importance['pred'] = np.exp( (sd * model.predict(importance[std_pred_cols].values)) + mu)
importance = importance.sort_values('pred', ascending=0)
importance.head()[['feature','pred']]
importance[importance.feature == 'std_fur_/_skin_Leopard']
importance['feature'] = importance.feature.apply(lambda x: re.sub('std_', '', x))
importance['value'] = importance.feature.apply(lambda x: re.split('_', x)[-1])
importance['feature'] = importance.feature.apply(lambda x: '_'.join(re.split('_', x)[:-1]))
mn = importance.groupby('feature').pred.min().reset_index().rename(columns={'pred':'baseline'})
importance = importance.merge(mn)
importance['pred_vs_baseline'] = importance.pred - importance.baseline
importance['pct_vs_baseline'] = (importance.pred / importance.baseline) - 1
importance[(importance.feature == 'fur_/_skin')].sort_values('pred')[['value','rarity','pred','pred_lin','pred_log','std_rank','std_score']].sort_values('rarity')
importance['collection'] = collection
importance.sort_values('pct_vs_baseline')[['feature','value','pct_vs_baseline']]
tmp = importance[std_pred_cols].mean().reset_index()
tmp.columns = [ 'a', 'b' ]
tmp = tmp.sort_values('b')
feature_values = feature_values.append(importance[['collection','feature','value','pred','pred_vs_baseline','pct_vs_baseline','rarity']])
attributes['feature'] = attributes.feature.apply(lambda x: re.sub('_', ' ', x).title() )
feature_values['feature'] = feature_values.feature.apply(lambda x: re.sub('_', ' ', x).title() )
pred_price = pred_price[[ 'collection', 'contract_address', 'token_id', 'hri_rank', 'rk', 'pred_price', 'pred_sd' ]]
coefsdf.to_csv('./data/coefsdf.csv', index=False)
salesdf.to_csv('./data/model_sales.csv', index=False)
pred_price.to_csv('./data/pred_price.csv', index=False)
attributes.to_csv('./data/attributes.csv', index=False)
feature_values.to_csv('./data/feature_values.csv', index=False)
pred_price = pd.read_csv('./data/pred_price.csv')
tokens = pd.read_csv('./data/tokens.csv')
rem = tokens[tokens.clean_token_id>=10000].token_id.unique()
l0 = len(pred_price)
pred_price = pred_price[ -((pred_price.collection == 'LunaBulls') & (pred_price.token_id.isin(rem))) ]
l1 = len(pred_price)
pred_price.to_csv('./data/pred_price.csv', index=False)
# listings = pd.read_csv('./data/listings.csv')
# listings['token_id'] = listings.token_id.astype(int)
# tmp = salesdf.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
# tmp = pred_price.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
# tmp['token_id'] = tmp.token_id.astype(int)
# tmp = tmp.merge(listings[['collection','token_id','price']])
# tmp.sort_values('pred_price', ascending=0)
if CHECK_EXCLUDE:
salesdf['rat'] = salesdf.price / salesdf.pred
salesdf['dff'] = salesdf.price - salesdf.pred
salesdf['exclude_1'] = (((salesdf.dff >= 20) & (salesdf.rat > 4)) | ((salesdf.dff >= 40) & (salesdf.rat > 3)) | ((salesdf.dff >= 60) & (salesdf.rat > 2)) | ((salesdf.dff >= 80) & (salesdf.rat > 2))).astype(int)
salesdf['rat'] = salesdf.pred / salesdf.price
salesdf['dff'] = salesdf.pred - salesdf.price
salesdf['exclude_2'] = (((salesdf.dff >= 20) & (salesdf.rat > 4)) | ((salesdf.dff >= 40) & (salesdf.rat > 3)) | ((salesdf.dff >= 60) & (salesdf.rat > 2)) | ((salesdf.dff >= 80) & (salesdf.rat > 2))).astype(int)
salesdf['exclude'] = (salesdf.exclude_1 + salesdf.exclude_2).apply(lambda x: int(x>0))
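# a sale is flagged when price and prediction disagree by both a large absolute gap
# (dff) and a large ratio (rat), in either direction; flagged rows are written to
# exclude.csv (presumably filtered out of future model runs)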
print(salesdf.exclude_1.mean())
print(salesdf.exclude_2.mean())
print(salesdf.exclude.mean())
salesdf[salesdf.token_id == '2239'][['collection','price','exclude']]
salesdf[salesdf.exclude == 1][[ 'collection','token_id','price','exclude' ]].to_csv('./data/exclude.csv', index=False)
attributes[ (attributes.collection == 'thugbirdz') & (attributes.token_id == '1869') ]
feature_values[ (feature_values.collection == 'thugbirdz') & (feature_values.feature == 'position_in_gang') ]
sorted(feature_values[ (feature_values.collection == 'thugbirdz') ].feature.unique())
pred_price[pred_price.collection == 'peskypenguinclub'].head()

View File

@@ -45,7 +45,7 @@ server <- function(input, output, session) {
selectInput(
inputId = 'collectionname'
, label = NULL
, selected = 'LunaBulls'
, selected = 'Levana Dragon Eggs'
, choices = choices
, width = "100%"
)
@@ -197,24 +197,27 @@ server <- function(input, output, session) {
return(head(attributes, 0))
}
cur <- attributes[ token_id == eval(as.numeric(id)) & collection == eval(selected) ]
cur <- merge( cur, feature_values[collection == eval(selected), list(feature, value, pred_vs_baseline, pct_vs_baseline) ], all.x=TRUE )
# cur <- merge( cur, feature_values[collection == eval(selected), list(feature_name, feature_value, pred_vs_baseline, pct_vs_baseline) ], all.x=TRUE )
cur <- cur[order(rarity)]
floor <- getFloors()[2]
log_coef <- coefsdf[ collection == eval(selected) ]$log_coef[1]
lin_coef <- coefsdf[ collection == eval(selected) ]$lin_coef[1]
s <- sum(cur$pct_vs_baseline)
p <- getPredPrice()
p <- as.numeric(p[ token_id == eval(as.numeric(id)) ]$pred_price)
# p <- pred_price[ token_id == eval(as.numeric(id)) & collection == eval(selected) ]$pred_price
ratio <- (p / floor) - 1
ratio <- pmax(0, ratio)
if (ratio > 0 & length(ratio) > 0) {
mult <- ratio / s
cur[, pct_vs_baseline := pct_vs_baseline * eval(mult) ]
}
cur[, vs_baseline := round((pred_vs_baseline * eval(lin_coef)) + (pct_vs_baseline * eval(floor) * eval(log_coef) ), 1) ]
cur[, pred_vs_baseline := round(pred_vs_baseline, 1) ]
cur[, vs_baseline := round(pred_vs_baseline + (pct_vs_baseline * eval(floor)), 1) ]
# floor <- getFloors()[2]
# log_coef <- coefsdf[ collection == eval(selected) ]$log_coef[1]
# lin_coef <- coefsdf[ collection == eval(selected) ]$lin_coef[1]
# s <- sum(cur$pct_vs_baseline)
# p <- getPredPrice()
# p <- as.numeric(p[ token_id == eval(as.numeric(id)) ]$pred_price)
# # p <- pred_price[ token_id == eval(as.numeric(id)) & collection == eval(selected) ]$pred_price
# ratio <- (p / floor) - 1
# ratio <- pmax(0, ratio)
# if (ratio > 0 & length(ratio) > 0) {
# mult <- ratio / s
# cur[, pct_vs_baseline := pct_vs_baseline * eval(mult) ]
# }
cur[, vs_baseline := 0 ]
cur[, pred_vs_baseline := 0 ]
cur[, vs_baseline := 0 ]
# cur[, vs_baseline := round((pred_vs_baseline * eval(lin_coef)) + (pct_vs_baseline * eval(floor) * eval(log_coef) ), 1) ]
# cur[, pred_vs_baseline := round(pred_vs_baseline, 1) ]
# cur[, vs_baseline := round(pred_vs_baseline + (pct_vs_baseline * eval(floor)), 1) ]
return(cur)
})
@@ -223,9 +226,11 @@ server <- function(input, output, session) {
if( nrow(data) == 0 ) {
return(NULL)
}
data[, rarity := paste0(format(round(rarity*100, 2), digits=4, decimal.mark="."),'%') ]
data[, rarity := ifelse(is.na(rarity), '', paste0(format(round(rarity*100, 2), digits=4, decimal.mark="."),'%') )]
# reactable(data[, list( feature, value, rarity, vs_baseline, pred_vs_baseline, pct_vs_baseline )],
data <- data[, list( feature, value, rarity, pct_vs_baseline )]
# data <- data[, list( feature, value, rarity, pct_vs_baseline )]
data <- data[, list( feature_name, feature_value, rarity )]
reactable(data,
defaultColDef = colDef(
headerStyle = list(background = "#10151A")
@@ -234,17 +239,17 @@ server <- function(input, output, session) {
borderless = TRUE,
outlined = FALSE,
columns = list(
feature = colDef(name = "Attribute", align = "left"),
value = colDef(name = "Value", align = "left"),
rarity = colDef(name = "Rarity", align = "left"),
pct_vs_baseline = colDef(
name="Value", header=with_tooltip("Value", "The estimated price impact of this feature vs the floor")
, html = TRUE
, align = "left"
, cell = function(x) {
htmltools::tags$span(paste0('+', format(round(x*1000)/10, digits=4, decimal.mark=".", big.mark=","), '%'))
}
)
feature_name = colDef(name = "Attribute", align = "left"),
feature_value = colDef(name = "Value", align = "left"),
rarity = colDef(name = "Rarity", align = "left")
# pct_vs_baseline = colDef(
# name="Value", header=with_tooltip("Value", "The estimated price impact of this feature vs the floor")
# , html = TRUE
# , align = "left"
# , cell = function(x) {
# htmltools::tags$span(paste0('+', format(round(x*1000)/10, digits=4, decimal.mark=".", big.mark=","), '%'))
# }
# )
)
)
})
@@ -255,7 +260,7 @@ server <- function(input, output, session) {
return(NULL)
}
data <- feature_values[ collection == eval(selected)]
reactable(data[, list( feature, value, rarity, pct_vs_baseline )],
reactable(data[, list( feature_name, feature_value, rarity, pct_vs_baseline )],
defaultColDef = colDef(
headerStyle = list(background = "#10151A")
),
@@ -263,8 +268,8 @@ server <- function(input, output, session) {
outlined = FALSE,
searchable = TRUE,
columns = list(
feature = colDef(name = "Attribute", align = "left"),
value = colDef(name = "Value", align = "left"),
feature_name = colDef(name = "Attribute", align = "left"),
feature_value = colDef(name = "Value", align = "left"),
rarity = colDef(name = "Rarity", align = "left", cell = function(x) {
htmltools::tags$span(paste0(format(x*100, digits=3, decimal.mark=".", big.mark=","),'%'))
}),
@@ -504,7 +509,9 @@ server <- function(input, output, session) {
df[, deal_score := round(pmin( 100, pmax(0, deal_score) )) ]
df[, deal_score := pnorm(price, pred_price, eval(SD_SCALE) * pred_sd * pred_price / pred_price_0), by = seq_len(nrow(df)) ]
df[, deal_score := round(100 * (1 - deal_score)) ]
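# deal_score = 100 * (1 - pnorm(price, pred_price, sd)): the probability that a draw
# from the fair-value distribution exceeds the listing price, scaled to 0-100
# (cheaper relative to the prediction = higher score)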
df[, pred_price := round(pred_price) ]
# df[, pred_price := round(pred_price) ]
df[, pred_price := paste0(format(round(pred_price, 1), digits=3, decimal.mark=".", big.mark=",")) ]
df <- df[, list(token_id, price, pred_price, deal_score)]
df <- df[order(-deal_score)]
return(df)
@@ -517,7 +524,7 @@ server <- function(input, output, session) {
if( nrow(df) == 0 ) {
return(NULL)
}
df <- df[ deal_score >= 10 ]
df <- df[ deal_score >= 0 ]
df[, hover_text := paste0('<b>#',token_id,'</b><br>Listing Price: ',price,'<br>Fair Market Price: ',pred_price,'<br>Deal Score: ',deal_score) ]
fig <- plot_ly(

View File

@@ -102,17 +102,33 @@ fluidPage(
, fluidRow(
class="grey8row"
, h2("Listings", icon(class="padding-left-10", id="listings-tooltip", "info-circle"))
, bsTooltip(id = "listings-tooltip", title = "Plot only shows listings with deal score > 10; Click a dot to select the token", placement = "bottom", trigger = "hover")
, bsTooltip(id = "listings-tooltip", title = "Plot only shows listings with deal score > 5; Click a dot to select the token", placement = "bottom", trigger = "hover")
, div(
class = "listing-plot"
, plotlyOutput("listingplot", height = 500)
, div(class='description', 'Plot only shows listings with deal score > 10')
, div(class='description', 'Plot only shows listings with deal score > 5')
, div(class='description', 'Click a dot to select the token')
)
, div(class = "table", reactableOutput("listingtable"))
, div(class = "description", 'This app is still in beta - listings updates will be periodic (but at least 3x a week)')
, div(class = "link", uiOutput('listingurl'))
)
, fluidRow(
class="grey8row faq"
, h2("FAQ")
, h4("What is NFT Deal Score?")
, div("We use historical sales data to determine the values and the rankings of each NFT.")
, h4("Why is this rank different?")
, div("Although rarity is a feature in our model, it is not just a rarity-based ranking. Certain features are put at a higher premium on the secondary marketplace, and this ranking reflects that.")
, h4("Why are the rarity %s different?")
, div("Our %s reflect only the NFTs in existence. Other tools may include more theoretical numbers.")
, h4("How does the model work?")
, div("Each attribute is an input into the model. We are working to add better model explanations to the tool.")
, h4("How often is the data updated?")
, div("Listings are updated 3x / week. Model is updated weekly.")
, h4("Where can I send my questions?")
, div(a(class="", href="https://twitter.com/nftdealscore", "@nftdealscore"), " on Twitter")
)
, fluidRow(
class="grey8row"
, h2("NFT Rankings", icon(class="padding-left-10", id="nft-rankings-tooltip", "info-circle"))
@@ -127,11 +143,11 @@ fluidPage(
, div(class = "table", reactableOutput("salestable"))
, div(class = "description", 'This app is still in beta - sales data may be incomplete or delayed')
)
, fluidRow(
class="grey8row"
, h2("Feature Summary", icon(class="padding-left-10", id="feature-summary-tooltip", "info-circle"))
, bsTooltip(id = "feature-summary-tooltip", title = "Shows the rarity and estimated price impact of each feature", placement = "bottom", trigger = "hover")
, div(class = "table", reactableOutput("featurestable"))
, div(class = "description", 'Shows the rarity and estimated price impact of each feature')
)
# , fluidRow(
# class="grey8row"
# , h2("Feature Summary", icon(class="padding-left-10", id="feature-summary-tooltip", "info-circle"))
# , bsTooltip(id = "feature-summary-tooltip", title = "Shows the rarity and estimated price impact of each feature", placement = "bottom", trigger = "hover")
# , div(class = "table", reactableOutput("featurestable"))
# , div(class = "description", 'Shows the rarity and estimated price impact of each feature')
# )
)

View File

@@ -264,6 +264,15 @@ tr {
}
/****************/
/* FAQ */
/****************/
.faq > h4 {
font-size: 22px;
padding-top: 32px;
}
/*******************/
/* General */
/*******************/