Mirror of https://github.com/FlipsideCrypto/nft-deal-score.git (synced 2026-02-06 10:56:58 +00:00)

Commit 9dd1d71538 ("levana"), parent e14a6b539b

load_data.py — 36 changed lines
@@ -55,8 +55,10 @@ def run_queries():
metadata = ctx.cursor().execute(' '.join(query))
metadata = pd.DataFrame.from_records(iter(metadata), columns=[x[0] for x in metadata.description])
metadata = clean_colnames(metadata)
metadata['image'] = metadata.image.apply(lambda x: 'https://cloudflare-ipfs.com/ipfs/'+re.split('/', x)[-1] )
metadata['collection'] = c
metadata['chain'] = 'Terra'
list(metadata.image.values[:2]) + list(metadata.image.values[-2:])
metadata.to_csv('./data/metadata/{}.csv'.format(c), index=False)
# old = pd.read_csv('./data/metadata.csv')
# old = old[-old.collection.isin(metadata.collection.unique())]
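A minimal sketch of the Snowflake-to-pandas pattern used in this hunk, assuming `ctx` is an open snowflake.connector connection; the query text and the follow-up image rewrite are placeholders, not the exact query from the repo:

import re
import pandas as pd
import snowflake.connector

def fetch_df(ctx, query):
    # Execute the query and build a DataFrame from the cursor:
    # cursor.description holds the column names, iter(cursor) yields the rows.
    cur = ctx.cursor().execute(query)
    return pd.DataFrame.from_records(iter(cur), columns=[col[0] for col in cur.description])

# Hypothetical usage:
# metadata = fetch_df(ctx, 'SELECT * FROM terra.msgs LIMIT 10')
# metadata['image'] = metadata.image.apply(lambda x: 'https://cloudflare-ipfs.com/ipfs/' + re.split('/', x)[-1])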
@@ -75,7 +77,7 @@ def add_terra_tokens():
, msg_value:execute_msg:mint_nft:extension:name AS name
, msg_value:execute_msg:mint_nft:extension:image AS image
FROM terra.msgs
WHERE msg_value:contract::string = 'terra1trn7mhgc9e2wfkm5mhr65p3eu7a2lc526uwny2'
WHERE msg_value:contract::string = 'terra16wuzgsx3tz4hkqu73q5s7unxenefkkvefvewsh'
AND tx_status = 'SUCCEEDED'
AND msg_value:execute_msg:mint_nft is not null
'''
@@ -164,6 +166,9 @@ def add_terra_metadata():
metadata['attribute_count'] = 0
l = len(metadata)
incl_att_count = not collection in [ 'Levana Dragon Eggs' ]
metadata.groupby('cracking_date').token_id.count()
metadata.groupby('weight').token_id.count()
metadata[metadata.cracking_date=='2471-12-22'][['token_id']]
for c in list(metadata.columns) + ['attribute_count']:
if c in ['token_id','collection','pct','levana_rank','meteor_id']:
continue
@@ -186,11 +191,18 @@ def add_terra_metadata():
# metadata.sort_values('pct_rank')
metadata.sort_values('pct')
metadata['rank'] = metadata.pct.rank()
metadata['score'] = metadata.pct.apply(lambda x: 1.0 / x )
mn = metadata.score.min()
metadata['score'] = metadata.score.apply(lambda x: x / mn )
metadata.score.max()
metadata.sort_values('rank')[['rank','pct','score']]
metadata['rarity_score'] = metadata.pct.apply(lambda x: 1.0 / (x**0.2) )
mn = metadata.rarity_score.min()
mx = metadata.rarity_score.max()
metadata['rarity_score'] = metadata.rarity_score.apply(lambda x: round(((x - mn) * 999 / (mx - mn)) + 1) )
metadata.sort_values('rarity_score', ascending=0).head(20)[['token_id','collection_rank','rarity_score']]
metadata.sort_values('rarity_score', ascending=0).tail(20)[['token_id','collection_rank','rarity_score']]
metadata[metadata.token_id==6157].sort_values('rarity_score', ascending=0).tail(20)[['token_id','collection_rank','rarity_score','rank']]
metadata[metadata['rank']>=3000].groupby('weight').token_id.count()

metadata.rarity_score.max()
metadata.rarity_score.min()
metadata.sort_values('rank')[['rank','pct','rarity_score']]

m = pd.DataFrame()
for c in metadata.columns:
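The rarity_score above compresses each token's joint trait probability pct with a fifth root and then min-max scales it onto a 1–1000 integer range. A small sketch of that transform on a toy frame (column names follow the diff; the numbers are made up):

import pandas as pd

metadata = pd.DataFrame({'token_id': [1, 2, 3],
                         'pct': [0.00001, 0.0005, 0.02]})  # joint rarity of each token's traits

# raw score: rarer tokens (smaller pct) get larger values; **0.2 dampens the spread
metadata['rarity_score'] = metadata.pct.apply(lambda x: 1.0 / (x ** 0.2))

# rescale to an integer score between 1 (most common) and 1000 (rarest)
mn = metadata.rarity_score.min()
mx = metadata.rarity_score.max()
metadata['rarity_score'] = metadata.rarity_score.apply(lambda x: round(((x - mn) * 999 / (mx - mn)) + 1))
print(metadata)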
@@ -201,16 +213,20 @@ def add_terra_metadata():
m = m.append(cur)
m['chain'] = 'Terra'
m.groupby('feature_name').feature_value.count()
m[m.feature_name=='face'].groupby('feature_value').token_id.count()
print(len(m.token_id.unique()))
if collection == 'Levana Dragon Eggs':
add = m[m.feature_name=='collection_rank']
add['feature_name'] = 'transformed_collection_rank'
add['feature_value'] = add.feature_value.apply(lambda x: (1.0/ (x + 0.5))**1 )
m = m.append(add)
g = m.groupby('feature_value').feature_name.count().reset_index().sort_values('feature_name').tail(50)
old = pd.read_csv('./data/metadata.csv')
if not 'chain' in old.columns:
old['chain'] = old.collection.apply(lambda x: 'Terra' if x in [ 'Galactic Punks', 'LunaBulls' ] else 'Solana' )
old = old[-old.collection.isin(m.collection.unique())]
old = old.append(m)
old = old.drop_duplicates()
print(old.groupby(['chain','collection']).token_id.count())
old = old.drop_duplicates(subset=['collection','token_id','feature_name'])
old = old[-(old.feature_name.isin(['last_sale']))]
# print(old.groupby(['chain','collection']).token_id.count())
print(old[['chain','collection','token_id']].drop_duplicates().groupby(['chain','collection']).token_id.count())
old.to_csv('./data/metadata.csv', index=False)

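add_terra_metadata finishes by stacking every trait column into the long (collection, token_id, feature_name, feature_value) layout that data/metadata.csv uses, and for Levana Dragon Eggs it also derives transformed_collection_rank = 1/(rank + 0.5) as an extra numeric feature. A compact sketch of that reshape on toy data, using pandas melt instead of the column-by-column append loop from the script:

import pandas as pd

wide = pd.DataFrame({'collection': ['Levana Dragon Eggs'] * 2,
                     'token_id': [1, 2],
                     'collection_rank': [10, 250],
                     'essence': ['Frost', 'Storm']})

# wide trait columns -> one row per (token, feature)
m = wide.melt(id_vars=['collection', 'token_id'],
              var_name='feature_name', value_name='feature_value')

# derived numeric feature: smaller ranks map to larger values
add = m[m.feature_name == 'collection_rank'].copy()
add['feature_name'] = 'transformed_collection_rank'
add['feature_value'] = add.feature_value.apply(lambda x: 1.0 / (x + 0.5))
m = pd.concat([m, add], ignore_index=True)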
@@ -4,8 +4,8 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
msg_value:execute_msg:mint:extension:attributes[2]:value::string as origin,
@@ -39,7 +39,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -74,7 +74,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -109,7 +109,7 @@ WITH legendary_traits AS (
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as tokenid,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,

@@ -5,7 +5,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -38,7 +38,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -74,7 +74,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -109,7 +109,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -144,7 +144,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,
@@ -179,7 +179,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,

@@ -4,7 +4,7 @@ select block_timestamp,
block_id,
tx_id,
msg_value:execute_msg:mint:extension:name::string as name,
CONCAT('https://d75aawrtvbfp1.cloudfront.net/',msg_value:execute_msg:mint:extension:image::string) as image,
msg_value:execute_msg:mint:extension:image::string as image,
msg_value:execute_msg:mint:token_id::string as token_id,
msg_value:execute_msg:mint:extension:attributes[0]:value::string as rarity,
msg_value:execute_msg:mint:extension:attributes[1]:value::string as rank,

@@ -39,16 +39,18 @@ def clean_name(name):
def scrape_randomearth():
d_address = {
'Galactic Punks': 'terra103z9cnqm8psy0nyxqtugg6m7xnwvlkqdzm4s4k',
'LunaBulls': 'terra1trn7mhgc9e2wfkm5mhr65p3eu7a2lc526uwny2'
'LunaBulls': 'terra1trn7mhgc9e2wfkm5mhr65p3eu7a2lc526uwny2',
'Levana Dragon Eggs': 'terra1k0y373yxqne22pc9g7jvnr4qclpsxtafevtrpg',
}
data = []
for collection in [ 'Galactic Punks', 'LunaBulls' ]:
# for collection in [ 'Levana Dragon Eggs' ]:
for collection in d_address.keys():
print(collection)
page = 0
has_more = True
while has_more:
page += 1
print('Page #{}'.format(page))
print('Page #{} ({})'.format(page, len(data)))
url = 'https://randomearth.io/api/items?collection_addr={}&sort=price.asc&page={}&on_sale=1'.format( d_address[collection], page)
browser.get(url)
soup = BeautifulSoup(browser.page_source)
@@ -59,6 +61,7 @@ def scrape_randomearth():
for i in j['items']:
data += [[ 'Terra', collection, i['token_id'], i['price'] / (10 ** 6) ]]
df = pd.DataFrame(data, columns=['chain','collection','token_id','price'])
df.to_csv('~/Downloads/tmp.csv', index=False)
old = pd.read_csv('./data/listings.csv')
old = old[-old.collection.isin(df.collection.unique())]
old = old.append(df)
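scrape_randomearth pages through the RandomEarth items API one collection address at a time until a page comes back empty, then folds the scraped prices into data/listings.csv. A simplified sketch of that pagination loop using requests directly instead of the Selenium/BeautifulSoup browser session the script relies on; the endpoint and field names are taken from the diff and should be treated as assumptions:

import requests
import pandas as pd

def scrape_collection(collection, addr):
    data, page, has_more = [], 0, True
    while has_more:
        page += 1
        url = ('https://randomearth.io/api/items'
               '?collection_addr={}&sort=price.asc&page={}&on_sale=1').format(addr, page)
        j = requests.get(url, timeout=30).json()
        items = j.get('items', [])
        has_more = len(items) > 0
        for i in items:
            # prices come back in micro-units, so divide by 10**6
            data.append(['Terra', collection, i['token_id'], i['price'] / (10 ** 6)])
    return pd.DataFrame(data, columns=['chain', 'collection', 'token_id', 'price'])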
@@ -189,9 +192,12 @@ def convert_collection_names():
,'boryokudragonz': 'Boryoku Dragonz'
}
for c in [ 'pred_price', 'attributes', 'feature_values', 'model_sales', 'listings', 'coefsdf', 'tokens' ]:
df = pd.read_csv('./data/{}.csv'.format(c))
df['collection'] = df.collection.apply(lambda x: clean_name(x) if x in d.keys() else x )
df.to_csv('./data/{}.csv'.format(c), index=False)
try:
df = pd.read_csv('./data/{}.csv'.format(c))
df['collection'] = df.collection.apply(lambda x: clean_name(x) if x in d.keys() else x )
df.to_csv('./data/{}.csv'.format(c), index=False)
except:
pass

def scrape_recent_sales():
o_sales = pd.read_csv('./data/sales.csv')
@@ -234,6 +240,7 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk
, 'degenapes': 'degen-ape-academy'
, 'peskypenguinclub': 'pesky-penguins'
}
collection = 'smb'
for collection in collections:
if collection == 'boryokudragonz':
continue
@@ -249,7 +256,7 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk
print('{} page #{} ({})'.format(collection, page, len(data)))
sleep(3)
page += 1
for j in [25, 30, 35, 30, 25] * 2:
for j in [20, 30, 30, 30, 30, 30, 30, 30] * 1:
for _ in range(1):
soup = BeautifulSoup(browser.page_source)
# for row in browser.find_elements_by_class_name('ag-row'):
@@ -325,6 +332,7 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk

pred_price = pd.read_csv('./data/pred_price.csv')[['collection','token_id','pred_price','pred_sd']]
pred_price['collection'] = pred_price.collection.apply(lambda x: clean_name(x))
pred_price['token_id'] = pred_price.token_id.astype(str)
pred_price = pred_price.merge(listings)

coefsdf = pd.read_csv('./data/coefsdf.csv')
@@ -338,7 +346,10 @@ def scrape_listings(collections = [ 'aurory','thugbirdz','smb','degenapes','pesk

metadata = pd.read_csv('./data/metadata.csv')
solana_blob = metadata[ (metadata.collection == 'aurory') & (metadata.feature_name == 'skin') & (metadata.feature_value == 'Solana Blob (9.72%)')].token_id.unique()
pred_price['pred_price'] = pred_price.apply(lambda x: (x['pred_price'] * 0.8) - 8 if x['token_id'] in solana_blob and x['collection'] == 'Aurory' else x['pred_price'], 1 )
pred_price['pred_price'] = pred_price.apply(lambda x: (x['pred_price'] * 0.8) - 4 if x['token_id'] in solana_blob and x['collection'] == 'Aurory' else x['pred_price'], 1 )

solana_blob = metadata[ (metadata.collection == 'aurory') & (metadata.feature_name == 'hair') & (metadata.feature_value == 'Long Blob Hair (9.72%)')].token_id.unique()
pred_price['pred_price'] = pred_price.apply(lambda x: (x['pred_price'] * 0.8) - 2 if x['token_id'] in solana_blob and x['collection'] == 'Aurory' else x['pred_price'], 1 )

pred_price['abs_chg'] = (pred_price.floor - pred_price.floor_price) * pred_price.lin_coef
pred_price['pct_chg'] = (pred_price.floor - pred_price.floor_price) * pred_price.log_coef
@@ -711,6 +722,7 @@ def scratch():
# print('Sleeping until {}'.format(sleep_to))
# sleep(60 * 15)
alerted = []
scrape_randomearth()
alerted = scrape_listings(alerted = alerted)
# scrape_randomearth()
# alerted = scrape_listings(['smb'],alerted = alerted)
convert_collection_names()
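scrape_listings joins live listings to the model's pred_price and then shifts each prediction by how far the current floor has moved since the model was fit: abs_chg and pct_chg apply the stored lin_coef/log_coef to (current floor - floor at training time). A hedged sketch of that adjustment with invented data; the final adj_pred line is my assumption about how the shift is applied downstream, not something shown in the diff:

import pandas as pd

pred_price = pd.DataFrame({'collection': ['Aurory', 'Aurory'],
                           'token_id': ['1', '2'],
                           'pred_price': [25.0, 40.0],
                           'lin_coef': [0.6, 0.6],       # per-collection blend weights saved in coefsdf.csv
                           'log_coef': [0.4, 0.4],
                           'floor_price': [20.0, 20.0],  # floor when the model was trained
                           'floor': [24.0, 24.0]})       # floor right now

# shift predictions by how much the floor has moved since training
pred_price['abs_chg'] = (pred_price.floor - pred_price.floor_price) * pred_price.lin_coef
pred_price['pct_chg'] = (pred_price.floor - pred_price.floor_price) * pred_price.log_coef
pred_price['adj_pred'] = pred_price.pred_price + pred_price.abs_chg  # assumption: how the shift feeds the deal score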
solana_model.py — 528 changed lines
@@ -10,7 +10,7 @@ import tensorflow as tf
import snowflake.connector
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

warnings.filterwarnings('ignore')
@@ -18,26 +18,45 @@ warnings.filterwarnings('ignore')
os.chdir('/Users/kellenblumberg/git/nft-deal-score')

CHECK_EXCLUDE = False
CHECK_EXCLUDE = True
# CHECK_EXCLUDE = True

# Using sales from howrare.is - the last sale that was under 300 was when the floor was at 72. Filtering for when the floor is >100, the lowest sale was 400

###################################
# Define Helper Functions #
###################################
def standardize_df(df, cols, usedf=None):
def standardize_df(df, cols, usedf=None, verbose=False):
for c in cols:
if type(usedf) != type(pd.DataFrame()):
usedf = df
mu = usedf[c].mean()
sd = usedf[c].std()
# print(c)
if verbose:
print(c)
if len(df[c].unique()) == 2 and df[c].max() == 1 and df[c].min() == 0:
df['std_{}'.format(c)] = df[c].apply(lambda x: (x*2) - 1 )
else:
df['std_{}'.format(c)] = (df[c] - mu) / sd
return(df)

def merge(left, right, on=None, how='inner', ensure=True, verbose=True):
df = left.merge(right, on=on, how=how)
if len(df) != len(left) and (ensure or verbose):
print('{} -> {}'.format(len(left), len(df)))
cur = left.merge(right, on=on, how='left')
cols = set(right.columns).difference(set(left.columns))
print(cols)
col = list(cols)[0]
missing = cur[cur[col].isnull()]
print(missing.head())
if ensure:
assert(False)
return(df)

def just_float(x):
x = re.sub('[^\d\.]', '', str(x))
return(float(x))

def calculate_percentages(df, cols=[]):
add_pct = not 'pct' in df.columns
if not len(cols):
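Two of the helpers added in this hunk do the heavy lifting later in the script: standardize_df() z-scores continuous columns and maps 0/1 dummies to -1/+1, while merge() is an inner join that loudly reports, and optionally asserts on, dropped rows. A small usage sketch of those two functions on toy frames:

import pandas as pd

left = pd.DataFrame({'token_id': ['1', '2', '3'], 'price': [10.0, 12.0, 30.0]})
right = pd.DataFrame({'token_id': ['1', '2'], 'rank': [5, 900]})

# merge() behaves like an inner join but prints the row-count change and the
# rows that failed to match; with ensure=True it raises instead of silently dropping.
df = merge(left, right, on=['token_id'], ensure=False)

# standardize_df() adds std_<col> columns: z-scores for continuous features,
# -1/+1 for columns that only take the values 0 and 1.
df = standardize_df(df, ['price', 'rank'])
print(df.filter(like='std_'))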
@@ -64,13 +83,9 @@ exclude = [
# ( 'aurory', 3323, 138 )
]
s_df = pd.read_csv('./data/sales.csv').rename(columns={'sale_date':'block_timestamp'})
s_df[ s_df.collection == 'Levana Dragons' ].sort_values('block_timestamp', ascending=0).head()
print(len(s_df[s_df.collection == 'Levana Dragon Eggs']))
print(s_df.groupby('collection').token_id.count())
s_df.collection.unique()
s_df = s_df[-s_df.collection.isin(['Levana Meteors','Levana Dust'])]
s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
s_df = s_df[ -s_df.collection.isin(['boryokudragonz', 'Boryoku Dragonz']) ]
s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
for e in exclude:
s_df = s_df[-( (s_df.collection == e[0]) & (s_df.token_id == e[1]) & (s_df.price == e[2]) )]
s_df = s_df[ -((s_df.collection == 'smb') & (s_df.price < 1)) ]
@ -82,241 +97,183 @@ if not CHECK_EXCLUDE:
|
||||
s_df = s_df[s_df.exclude.isnull()]
|
||||
del s_df['exclude']
|
||||
|
||||
#########################
|
||||
# Load Metadata #
|
||||
#########################
|
||||
m_df = pd.read_csv('./data/metadata.csv')
|
||||
m_df['token_id'] = m_df.token_id.astype(str)
|
||||
tmp = m_df[m_df.collection.isin(['Levana Dragon Eggs','Levana Meteors','Levana Dust'])]
|
||||
tmp['tmp'] = tmp.token_id.astype(int)
|
||||
tmp.groupby('collection').tmp.max()
|
||||
m_df.head()
|
||||
# s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(x[:10], '%Y-%m-%d %H:%M:%S') )
|
||||
# remove ones that are not actually metadata
|
||||
m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
|
||||
m_df['feature_value'] = m_df.feature_value.apply(lambda x: re.split("\(", re.sub("\"", "", x))[0] if type(x)==str else x )
|
||||
m_df[(m_df.feature_name=='rank') & (m_df.collection == 'Levana Dragon Eggs')]
|
||||
|
||||
|
||||
#####################################
|
||||
# Exclude Special LunaBulls #
|
||||
#####################################
|
||||
tokens = pd.read_csv('./data/tokens.csv')
|
||||
tokens.token_id.unique()
|
||||
lunabullsrem = tokens[tokens.clean_token_id>=10000].token_id.unique()
|
||||
m_df = m_df[ -((m_df.collection == 'LunaBulls') & (m_df.token_id.isin(lunabullsrem))) ]
|
||||
s_df = s_df[ -((s_df.collection == 'LunaBulls') & (s_df.token_id.isin(lunabullsrem))) ]
|
||||
|
||||
|
||||
###########################
|
||||
# Calculate Floor #
|
||||
###########################
|
||||
s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(str(x)[:19], '%Y-%m-%d %H:%M:%S') if len(x) > 10 else datetime.strptime(x[:10], '%Y-%m-%d') )
|
||||
s_df['timestamp'] = s_df.block_timestamp.astype(int)
|
||||
# del metadata['price']
|
||||
# del metadata['last_sale']
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['days_ago'] = s_df.block_timestamp.apply(lambda x: (datetime.today() - x).days ).astype(int)
|
||||
s_df[[ 'block_timestamp','days_ago' ]].drop_duplicates(subset=['days_ago'])
|
||||
|
||||
s_df['av_20'] = s_df.groupby('collection')['mn_20'].rolling(20).mean().reset_index(0,drop=True)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
# s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).median().reset_index(0,drop=True)
|
||||
s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)
|
||||
# s_df[ (-((s_df.price) >= (s_df.md_20 * 0.2))) & (s_df.price.notnull()) & (s_df.collection == 'Levana Dragon Eggs') ]
|
||||
|
||||
s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.75) ]
|
||||
# lowest price in last 20 sales
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)
|
||||
|
||||
# exclude sales that are far below the existing floor
|
||||
s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.70) ]
|
||||
|
||||
# 10%ile of last 20 sales
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
# s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).min().reset_index(0,drop=True)
|
||||
s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.1).reset_index(0,drop=True)
|
||||
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(21).tail(40)
|
||||
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(20).sort_values('price')
|
||||
s_df['tmp'] = s_df.mn_20 / s_df.md_20
|
||||
|
||||
tmp = s_df[s_df.collection=='smb'][['mn_20','block_timestamp']]
|
||||
tmp['date'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
|
||||
tmp = tmp.groupby('date').mn_20.median().reset_index()
|
||||
tmp.to_csv('~/Downloads/tmp.csv', index=False)
|
||||
|
||||
s_df['tmp'] = s_df.price / s_df.mn_20
|
||||
s_df[s_df.collection == 'smb'].sort_values('block_timestamp')[['token_id','price','mn_20']]
|
||||
s_df[s_df.collection == 'smb'].sort_values('tmp').head(20)[['collection','token_id','price','mn_20','tmp']]
|
||||
s_df.groupby('collection').tmp.median()
|
||||
s_df.groupby('collection').tmp.mean()
|
||||
|
||||
s_df.sort_values('tmp').head()
|
||||
s_df['tmp'] = s_df.price / s_df.mn_20
|
||||
s_df[['collection','token_id','block_timestamp','price','mn_20','md_20','av_20','tmp']].to_csv('~/Downloads/tmp.csv', index=False)
|
||||
s_df.groupby('collection').tmp.median()
|
||||
s_df.groupby('collection').tmp.mean()
|
||||
s_df.sort_values('tmp', ascending=0).head()
|
||||
s_df.head(21)
|
||||
m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
|
||||
# m_df['feature_value'] = m_df.feature_value.apply(lambda x: x.strip() )
|
||||
# m_df.feature_value.unique()
|
||||
pred_cols = {}
|
||||
metadata = {}
|
||||
sales = {}
|
||||
collection_features = {}
|
||||
m_df[(m_df.collection == 'Galactic Punks') & (m_df.feature_name == 'pct')].sort_values('token_id')
|
||||
c = 'Galactic Punks'
|
||||
EXCLUDE_COLS = {
|
||||
'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
|
||||
}
|
||||
for c in s_df.collection.unique():
|
||||
print('Building {} model'.format(c))
|
||||
sales[c] = s_df[ s_df.collection == c ]
|
||||
exclude = EXCLUDE_COLS[c] if c in EXCLUDE_COLS.keys() else []
|
||||
pred_cols[c] = sorted([x for x in m_df[ m_df.collection == c ].feature_name.unique() if not x in exclude])
|
||||
collection_features[c] = [ c for c in pred_cols[c] if not c in ['score','rank','pct']+exclude ]
|
||||
metadata[c] = m_df[ (m_df.collection == c) & (-(m_df.feature_name.isin(exclude))) ]
|
||||
|
||||
# tmp = pd.pivot_table( metadata[c], ['collection','token_id'], columns=['feature_name'], values=['feature_value'] )
|
||||
metadata[c] = metadata[c].pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
|
||||
metadata[c].columns = [ 'collection','token_id' ] + pred_cols[c]
|
||||
|
||||
features = collection_features[c]
|
||||
cur = metadata[c]
|
||||
cur = cur.dropna(subset=features)
|
||||
for f in features:
|
||||
if type(cur[f].values[0] == str):
|
||||
cur[f] = cur[f].apply(lambda x: re.sub("\"", "", str(x) ) )
|
||||
cur[f] = cur[f].apply(lambda x: re.split("\(", x )[0].strip())
|
||||
cur = cur.replace('', 'Default')
|
||||
# if not 'pct' in cur.columns:
|
||||
cur = calculate_percentages( cur, features )
|
||||
dummies = pd.get_dummies(cur[features])
|
||||
feature_cols = dummies.columns
|
||||
cur = pd.concat([ cur.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
|
||||
metadata[c] = cur
|
||||
# pred_cols[c] = ['rank','score','timestamp','mn_20','log_mn_20'] + list(dummies.columns)
|
||||
cols = [ 'collection_rank' ]
|
||||
cols = [ ]
|
||||
pred_cols[c] = [ 'rank','transform_rank','score'] + [x for x in cols if x in m_df.feature_name.unique()] + list(dummies.columns)
|
||||
|
||||
# collection_features = {
|
||||
# 'Hashmasks': [ 'character','eyecolor','item','mask','skincolor' ]
|
||||
# , 'Galactic Punks': [ 'backgrounds','hair','species','suits','jewelry','headware','glasses' ]
|
||||
# , 'Solana Monkey Business': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
|
||||
# , 'Aurory': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
|
||||
# # , 'Thugbirdz': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
|
||||
# }
|
||||
|
||||
excludedf = pd.DataFrame()
|
||||
###########################
|
||||
# Calculate Floor #
|
||||
###########################
|
||||
coefsdf = pd.DataFrame()
|
||||
salesdf = pd.DataFrame()
|
||||
attributes = pd.DataFrame()
|
||||
pred_price = pd.DataFrame()
|
||||
feature_values = pd.DataFrame()
|
||||
collections = sorted(metadata.keys())
|
||||
collection = 'Galactic Punks'
|
||||
tokens = pd.read_csv('./data/tokens.csv')
|
||||
collection = 'Levana Dragon Eggs'
|
||||
# for collection in s_df.collection.unique():
|
||||
for collection in ['Levana Dragon Eggs']:
|
||||
# collection = 'LunaBulls'
|
||||
# collection = 'smb'
|
||||
# collection = 'aurory'
|
||||
# collection = 'meerkatmillionaires'
|
||||
# non-binary in model: collection_rank, temperature, weight
|
||||
# non-binary in model; exclude from rarity: pct, rank, score
|
||||
# exclude from model: lucky_number, shower
|
||||
# exclude from model and rarity %: meteor_id, attribute_count, cracking_date
|
||||
ALL_NUMERIC_COLS = ['rank','score','pct']
|
||||
MODEL_EXCLUDE_COLS = {
|
||||
# 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
|
||||
'Levana Dragon Eggs': ['meteor_id','shower','lucky_number','cracking_date','attribute_count']
|
||||
}
|
||||
RARITY_EXCLUDE_COLS = {
|
||||
# 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
|
||||
'Levana Dragon Eggs': ['meteor_id','attribute_count','collection_rank','transformed_collection_rank','rarity_score']
|
||||
}
|
||||
NUMERIC_COLS = {
|
||||
'Levana Dragon Eggs': ['collection_rank','weight','temperature','transformed_collection_rank','rarity_score']
|
||||
}
|
||||
ATT_EXCLUDE_COLS = {
|
||||
'Levana Dragon Eggs': ['attribute_count','transformed_collection_rank']
|
||||
}
|
||||
# for collection in [ 'Levana Dragon Eggs' ]:
|
||||
for collection in s_df.collection.unique():
|
||||
print('Working on collection {}'.format(collection))
|
||||
p_metadata = metadata[collection]
|
||||
if 'attribute_count' in p_metadata.columns:
|
||||
p_metadata['attribute_count'] = p_metadata.attribute_count.astype(float).astype(int)
|
||||
|
||||
p_sales = sales[collection]
|
||||
# specify the predictive features
|
||||
p_pred_cols = pred_cols[collection]
|
||||
p_features = collection_features[collection]
|
||||
p_sales['token_id'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
p_metadata['token_id'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
for c in [ 'rank','score' ]:
|
||||
p_metadata[c] = p_metadata[c].astype(float)
|
||||
# p_sales['contract_address'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
# p_metadata['contract_address'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
p_sales['contract_address'] = ''
|
||||
p_metadata['contract_address'] = ''
|
||||
sales = s_df[ s_df.collection == collection ]
|
||||
metadata = m_df[ m_df.collection == collection ]
|
||||
metadata[metadata.token_id == '1']
|
||||
metadata[metadata.feature_name == 'rank']
|
||||
metadata.feature_name.unique()
|
||||
|
||||
# remove 1 columns for each group (since they are colinear)
|
||||
# exclude = []
|
||||
# for f in p_features:
|
||||
# e = [ c for c in p_pred_cols if c[:len(f)] == f ][-1]
|
||||
# exclude.append(e)
|
||||
# categorize columns
|
||||
all_names = sorted(metadata.feature_name.unique())
|
||||
model_exclude = MODEL_EXCLUDE_COLS[collection] if collection in MODEL_EXCLUDE_COLS.keys() else []
|
||||
num_features = sorted((NUMERIC_COLS[collection] if collection in NUMERIC_COLS.keys() else []) + ALL_NUMERIC_COLS)
|
||||
num_features = [ x for x in num_features if x in metadata.feature_name.unique() ]
|
||||
num_metadata = metadata[metadata.feature_name.isin(num_features)]
|
||||
num_metadata[num_metadata.feature_name == 'rank']
|
||||
cat_features = sorted([ x for x in all_names if not x in (model_exclude + num_features) ])
|
||||
cat_metadata = metadata[metadata.feature_name.isin(cat_features)]
|
||||
|
||||
df = p_sales.merge(p_metadata, on=['token_id','contract_address'])
|
||||
df = df[df.mn_20.notnull()]
|
||||
# create dummies for binary variables
|
||||
num_metadata = num_metadata.pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
|
||||
num_metadata.columns = [ 'collection','token_id' ] + num_features
|
||||
|
||||
# create dummies for binary variables
|
||||
cat_metadata = cat_metadata.pivot( ['collection','token_id'], ['feature_name'], ['feature_value'] ).reset_index()
|
||||
cat_metadata.columns = [ 'collection','token_id' ] + cat_features
|
||||
cat_metadata = calculate_percentages( cat_metadata, cat_features )
|
||||
dummies = pd.get_dummies(cat_metadata[cat_features])
|
||||
cat_metadata = pd.concat([ cat_metadata.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
|
||||
del cat_metadata['pct']
|
||||
|
||||
pred_cols = num_features + list(dummies.columns)
|
||||
|
||||
# create training df
|
||||
df = merge(sales, num_metadata, ['collection','token_id'], ensure=False)
|
||||
df = merge(df, cat_metadata, ['collection','token_id'])
|
||||
for c in num_features:
|
||||
df[c] = df[c].apply(lambda x: just_float(x))
|
||||
|
||||
# create target cols
|
||||
target_col = 'adj_price'
|
||||
df[target_col] = df.apply(lambda x: max(0.7 * (x['mn_20'] - 0.2), x['price']), 1 )
|
||||
# df['mn_20'] = df.apply(lambda x: min(x[target_col], x['mn_20']), 1 )
|
||||
# tmp = df[['block_timestamp','mn_20']].copy()
|
||||
# tmp['tmp'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
|
||||
# tmp = tmp.groupby('tmp').mn_20.median().reset_index()
|
||||
# tmp.sort_values('tmp').to_csv('~/Downloads/tmp.csv', index=False)
|
||||
# df['timestamp'] = df.block_timestamp.astype(int)
|
||||
df = df[df[target_col].notnull()]
|
||||
df = df.reset_index(drop=True)
|
||||
df['transform_rank'] = df['rank'].apply(lambda x: 1.0 / (x**2) )
|
||||
df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
|
||||
df['rel_price_0'] = df[target_col] - df.mn_20
|
||||
df['rel_price_1'] = df[target_col] / df.mn_20
|
||||
df = df[df.mn_20 > 0]
|
||||
df['log_mn_20'] = np.log(df.mn_20)
|
||||
print('Training on {} sales'.format(len(df)))
|
||||
# df['price_median'] = df.groupby('token_id').price.median()
|
||||
df = standardize_df(df, pred_cols)
|
||||
|
||||
# standardize columns to mean 0 sd 1
|
||||
len(p_pred_cols)
|
||||
df = standardize_df(df, p_pred_cols)
|
||||
std_pred_cols_0 = [ 'std_{}'.format(c) for c in p_pred_cols ]
|
||||
# p_pred_cols = [ c for c in p_pred_cols if not c in exclude ]
|
||||
std_pred_cols = [ 'std_{}'.format(c) for c in p_pred_cols ]
|
||||
df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
|
||||
# df.sort_values('block_timestamp').head(10)[['price','tx_id']]
|
||||
# df.sort_values('block_timestamp').head(10)[['price','tx_id']].tx_id.values
|
||||
# df = df[df.price >= 1]
|
||||
std_pred_cols_0 = [ 'std_{}'.format(c) for c in pred_cols ]
|
||||
std_pred_cols = [ 'std_{}'.format(c) for c in pred_cols ]
|
||||
|
||||
#########################
|
||||
# Run the Model #
|
||||
#########################
|
||||
len(df)
|
||||
len(df.dropna(subset=std_pred_cols))
|
||||
tmp = df[std_pred_cols].count().reset_index()
|
||||
tmp.columns = ['a','b']
|
||||
tmp.sort_values('b').head(20)
|
||||
rem = list(tmp[tmp.b==0].a.values)
|
||||
std_pred_cols = [ c for c in std_pred_cols if not c in rem ]
|
||||
# if collection == 'Levana Dragon Eggs':
|
||||
# std_pred_cols = [ 'std_genus_Titan','std_score','std_weight','std_transformed_collection_rank','std_collection_rank','std_legendary_composition_None','std_ancient_composition_None' ]
|
||||
mn = df.timestamp.min()
|
||||
mx = df.timestamp.max()
|
||||
df['weight'] = df.timestamp.apply(lambda x: 2.5 ** ((x - mn) / (mx - mn)) )
|
||||
df['wt'] = df.timestamp.apply(lambda x: 2.5 ** ((x - mn) / (mx - mn)) )
|
||||
X = df[std_pred_cols].values
|
||||
mu = df.log_price.mean()
|
||||
sd = df.log_price.std()
|
||||
df['std_log_price'] = (df.log_price - mu) / sd
|
||||
# y = df.std_log_price.values
|
||||
# y = df[target_col].values
|
||||
# y = df.rel_price_1.values
|
||||
y_0 = df.rel_price_0.values
|
||||
y_1 = df.rel_price_1.values
|
||||
# y_log = df.log_price.values
|
||||
|
||||
clf_lin = RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_lin.fit(X, y_0, df.weight.values)
|
||||
|
||||
# run the linear model
|
||||
clf_lin = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
# clf_lin = RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_lin.fit(X, y_0, df.wt.values)
|
||||
if collection == 'Levana Dragon Eggs':
|
||||
coefs = []
|
||||
for a, b in zip(std_pred_cols, clf_lin.coef_):
|
||||
coefs += [[a,b]]
|
||||
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
|
||||
coefs.to_csv('~/Downloads/levana_lin_coefs.csv', index=False)
|
||||
df['pred_lin'] = clf_lin.predict(X)
|
||||
df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
|
||||
df['err_lin'] = abs(((df.pred_lin - df[target_col]) / df[target_col]) )
|
||||
# df['err_lin'] = abs(df.pred_lin - df.price )
|
||||
# df[[ 'price','pred_lin','err_lin','mn_20' ]].sort_values('err_lin').tail(50)
|
||||
df.head()
|
||||
clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_log.fit(X, y_1, df.weight.values)
|
||||
|
||||
# run the log model
|
||||
clf_log = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
# clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_log.fit(X, y_1, df.wt.values)
|
||||
if collection == 'Levana Dragon Eggs':
|
||||
coefs = []
|
||||
for a, b in zip(std_pred_cols, clf_lin.coef_):
|
||||
coefs += [[a,b]]
|
||||
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
|
||||
coefs.to_csv('~/Downloads/levana_log_coefs.csv', index=False)
|
||||
df['pred_log'] = clf_log.predict(X)
|
||||
df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
|
||||
df['err_log'] = abs(((df.pred_log - df[target_col]) / df[target_col]) )
|
||||
df[[ target_col,'pred_log','err_log','mn_20' ]].sort_values('err_log').tail(50)
|
||||
|
||||
df['err'] = df.err_lin * df.err_log
|
||||
|
||||
df[[ target_col,'pred_log','err_log','err_lin','err','mn_20' ]].sort_values('err').tail(50)
|
||||
df['collection'] = collection
|
||||
excludedf = excludedf.append(df[df.err > 2][['collection','token_id','price']])
|
||||
# df = df[df.err < 2]
|
||||
|
||||
print(round(len(df[df.err > 2]) * 100.0 / len(df), 2))
|
||||
|
||||
df[(df.err_log > 1) & (df.err_lin >= 5)]
|
||||
|
||||
clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_log.fit(X, y_1, df.weight.values)
|
||||
|
||||
clf_log = RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_log.fit(X, y_1, df.weight.values)
|
||||
df['pred_lin'] = clf_lin.predict(X)
|
||||
df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
|
||||
# df['pred_log'] = np.exp(clf_log.predict(X))
|
||||
df['pred_log'] = clf_log.predict(X)
|
||||
df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
|
||||
# combine the models
|
||||
clf = LinearRegression(fit_intercept=False)
|
||||
clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.weight.values )
|
||||
clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.wt.values )
|
||||
print('Price = {} * lin + {} * log'.format( round(clf.coef_[0], 2), round(clf.coef_[1], 2) ))
|
||||
l = df.sort_values('block_timestamp', ascending=0).mn_20.values[0]
|
||||
tmp = pd.DataFrame([[collection, clf.coef_[0], clf.coef_[1], l]], columns=['collection','lin_coef','log_coef','floor_price'])
|
||||
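The hunk above ends by training two complementary regressions per collection: clf_lin predicts price minus the rolling floor (an absolute premium) and clf_log predicts price divided by the floor (a multiplier), and a final no-intercept LinearRegression learns how to blend the two predictions. A condensed, self-contained sketch of that stacking idea on synthetic data (sample weights and the Lasso-vs-RidgeCV switch from the diff are omitted here):

import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV

rng = np.random.default_rng(0)
n = 200
X = rng.normal(size=(n, 5))         # standardized trait features
floor = 10 + rng.random(n) * 2      # rolling floor (mn_20) at sale time
price = floor + np.exp(X[:, 0]) + rng.normal(scale=0.5, size=n)

# model 1: premium over the floor; model 2: ratio to the floor
clf_lin = RidgeCV(alphas=[1.5 ** x for x in range(20)])
clf_lin.fit(X, price - floor)
clf_log = RidgeCV(alphas=[1.5 ** x for x in range(20)])
clf_log.fit(X, price / floor)

pred_lin = np.maximum(0, clf_lin.predict(X)) + floor    # never predict below the floor
pred_log = np.maximum(1, clf_log.predict(X)) * floor

# blend the two with a no-intercept linear model, as the script does
blend = LinearRegression(fit_intercept=False)
blend.fit(np.column_stack([pred_lin, pred_log]), price)
print('Price = {:.2f} * lin + {:.2f} * log'.format(*blend.coef_))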
@@ -335,34 +292,8 @@ for collection in ['Levana Dragon Eggs']:
df['pred'] = clf.predict( df[['pred_lin','pred_log']].values )
coefsdf = coefsdf.append(tmp)
df['err'] = (df.pred / df[target_col]).apply(lambda x: abs(x-1) )
df[df.block_timestamp>='2021-10-01'].sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'token_id','block_timestamp','err','mn_20' ]]
# df[df.block_timestamp>='2021-10-01'].err.mean()
df.merge(tokens[['collection','token_id','clean_token_id']]).sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'clean_token_id','rank','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).head(20)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).tail(40)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
df.sort_values('price', ascending=0).head(20).tx_id.values

# print(np.mean(y))
# print(np.mean(clf.predict(X)))

# # run neural net
# model = tf.keras.models.Sequential([
# tf.keras.layers.Dense(9, activation='relu')
# , tf.keras.layers.Dropout(.2)
# , tf.keras.layers.Dense(3, activation='relu')
# , tf.keras.layers.Dropout(.2)
# , tf.keras.layers.Dense(1, activation='linear')
# ])
# model.compile(loss='mae', optimizer=tf.keras.optimizers.SGD(learning_rate=0.0025))
# model.fit(X, y, epochs=500, validation_split=0.3)

# df['pred'] = np.exp( (sd * model.predict(df[std_pred_cols].values)) + mu)
# df['pred'] = model.predict(df[std_pred_cols].values)
# ratio = df.price.mean() / df.pred.mean()
# print("Manually increasing predictions by {}%".format(round((ratio-1) * 100, 1)))

# checking errors
# df['pred'] = df.pred * ratio
# print out some summary stats
df['err'] = df[target_col] - df.pred
df['q'] = df.pred.rank() * 10 / len(df)
df['q'] = df.q.apply(lambda x: int(round(x)) )
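The err and q columns above feed a simple calibration check: predictions are bucketed into deciles and the mean error per bucket is printed, which shows whether the model systematically over- or under-prices cheap versus expensive tokens. A toy version of that check with synthetic numbers:

import numpy as np
import pandas as pd

df = pd.DataFrame({'pred': np.linspace(10, 100, 50)})
df['price'] = df.pred + np.random.default_rng(1).normal(scale=5, size=len(df))

df['err'] = df.price - df.pred
df['q'] = (df.pred.rank() * 10 / len(df)).apply(lambda x: int(round(x)))  # decile bucket
print(df.groupby('q')[['err', 'pred', 'price']].mean())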
@ -373,137 +304,56 @@ for collection in ['Levana Dragon Eggs']:
|
||||
df['pred_price'] = df.pred#.apply(lambda x: x*(1+pe_mu) )
|
||||
df['pred_sd'] = df.pred * pe_sd
|
||||
print(df.groupby('q')[['err','pred',target_col]].mean())
|
||||
print(df[df.weight >= df.weight.median()].groupby('q')[['err','pred',target_col]].mean())
|
||||
print(df[df.wt >= df.wt.median()].groupby('q')[['err','pred',target_col]].mean())
|
||||
# df.err.mean()
|
||||
# df[df.weight >= 3.5].err.mean()
|
||||
df['collection'] = collection
|
||||
print('Avg err last 100: {}'.format(round(df.sort_values('block_timestamp').head(100).err.mean(), 2)))
|
||||
salesdf = salesdf.append( df[[ 'collection','contract_address','token_id','block_timestamp','price','pred','mn_20','rank','score' ]].sort_values('block_timestamp', ascending=0) )
|
||||
salesdf = salesdf.append( df[[ 'collection','token_id','block_timestamp','price','pred','mn_20','rank' ]].sort_values('block_timestamp', ascending=0) )
|
||||
|
||||
# create the attributes dataframe
|
||||
for f in p_features:
|
||||
cur = p_metadata[[ 'token_id', f, '{}_pct'.format(f) ]]
|
||||
cur.columns = [ 'token_id', 'value','rarity' ]
|
||||
cur['feature'] = f
|
||||
cur['collection'] = collection
|
||||
attributes = attributes.append(cur)
|
||||
|
||||
# create predictions for each NFT in the collection
|
||||
test = p_metadata.copy()
|
||||
############################################################
|
||||
# Create Predictions for Each NFT in The Collection #
|
||||
############################################################
|
||||
test = merge(num_metadata, cat_metadata, ['collection','token_id'])
|
||||
for c in num_features:
|
||||
test[c] = test[c].apply(lambda x: just_float(x) )
|
||||
tail = df.sort_values('timestamp').tail(1)
|
||||
for c in [ 'std_timestamp','mn_20','log_mn_20' ]:
|
||||
if c in tail.columns:
|
||||
test[c] = tail[c].values[0]
|
||||
test = standardize_df(test, [c for c in p_pred_cols if not c in ['timestamp'] ], df)
|
||||
# test['pred_lin'] = clf_lin.predict( test[std_pred_cols].values )
|
||||
# test['pred_log'] = np.exp(clf_log.predict( test[std_pred_cols].values ))
|
||||
test = standardize_df(test, pred_cols, df)
|
||||
|
||||
test['pred_lin'] = clf_lin.predict(test[std_pred_cols].values)
|
||||
test['pred_lin'] = test.pred_lin.apply(lambda x: max(0, x) + l)
|
||||
# test['pred_lin'] = df.pred_lin + df.mn_20
|
||||
# df['pred_log'] = np.exp(clf_log.predict(X))
|
||||
test['pred_log'] = clf_log.predict(test[std_pred_cols].values)
|
||||
test['pred_log'] = test.pred_log.apply(lambda x: max(1, x)) * l
|
||||
|
||||
test['pred'] = clf.predict( test[[ 'pred_lin','pred_log' ]].values )
|
||||
# test['pred'] = np.exp( (sd * model.predict(test[std_pred_cols].values)) + mu) * ratio
|
||||
test['pred_price'] = test.pred#.apply(lambda x: x*(1+pe_mu) )
|
||||
test['pred_price'] = clf.predict( test[[ 'pred_lin','pred_log' ]].values )
|
||||
if not CHECK_EXCLUDE:
|
||||
test['pred_price'] = test.pred.apply(lambda x: (x*0.985) )
|
||||
test['pred_sd'] = test.pred * pe_sd
|
||||
test['rk'] = test.pred.rank(ascending=0, method='first')
|
||||
test['pred_price'] = test.pred_price.apply(lambda x: (x*0.985) )
|
||||
test['pred_sd'] = test.pred_price * pe_sd
|
||||
test['rk'] = test.pred_price.rank(ascending=0, method='first')
|
||||
test['collection'] = collection
|
||||
pred_price = pred_price.append( test[[ 'collection', 'contract_address','token_id','rank','rk','pred_price','pred_sd' ] + p_features].rename(columns={'rank':'hri_rank'}).sort_values('pred_price') )
|
||||
# print(test[[ 'contract_address','token_id','pred_price','pred_sd' ]].sort_values('pred_price'))
|
||||
pred_price = pred_price.append( test[[ 'collection','token_id','rank','rk','pred_price','pred_sd' ]].sort_values('pred_price') )
|
||||
|
||||
cols = metadata.feature_name.unique()
|
||||
cols = [ x for x in cols if not x in (ATT_EXCLUDE_COLS[collection] if collection in ATT_EXCLUDE_COLS.keys() else []) + ALL_NUMERIC_COLS ]
|
||||
exclude = RARITY_EXCLUDE_COLS[collection] if collection in RARITY_EXCLUDE_COLS.keys() else []
|
||||
for c in cols:
|
||||
cur = metadata[metadata.feature_name == c][['collection','token_id','feature_name','feature_value']]
|
||||
if c in exclude:
|
||||
cur['rarity'] = None
|
||||
else:
|
||||
g = cur.groupby('feature_value').token_id.count().reset_index()
|
||||
g['rarity'] = g.token_id / len(cur.token_id.unique())
|
||||
cur = merge(cur, g[['feature_value','rarity']])
|
||||
attributes = attributes.append(cur)
|
||||
|
||||
##############################
|
||||
# Feature Importance #
|
||||
##############################
|
||||
coefs = []
|
||||
for a, b, c in zip(p_pred_cols, clf_lin.coef_, clf_log.coef_):
|
||||
coefs += [[ collection, a, b, c ]]
|
||||
coefs = pd.DataFrame(coefs, columns=['collection','col','lin_coef','log_coef'])
|
||||
# coefs['feature'] = coefs.col.apply(lambda x: ' '.join(re.split('_', x)[:-1]).title() )
|
||||
# coefs['feature'] = coefs.col.apply(lambda x: '_'.join(re.split('_', x)[:-1]) )
|
||||
# coefs['value'] = coefs.col.apply(lambda x: re.split('_', x)[-1] )
|
||||
# mn = coefs.groupby('feature')[[ 'lin_coef','log_coef' ]].min().reset_index()
|
||||
# mn.columns = [ 'feature','mn_lin_coef','mn_log_coef' ]
|
||||
# coefs = coefs.merge(mn)
|
||||
# coefs['lin_coef'] = coefs.lin_coef - coefs.mn_lin_coef
|
||||
# coefs['log_coef'] = coefs.log_coef - coefs.mn_log_coef
|
||||
# coefs
|
||||
# g = attributes[ attributes.collection == collection ][[ 'feature','value','rarity' ]].drop_duplicates()
|
||||
# g['value'] = g.value.astype(str)
|
||||
# len(coefs)
|
||||
# g = coefs.merge(g, how='left')
|
||||
# g[g.rarity.isnull()]
|
||||
# len(g)
|
||||
# coefs = coefs.merge( m_df[ m_df.collection == collection ][[ 'feature_name','' ]] )
|
||||
# coefs.sort_values('lin_coef').tail(20)
|
||||
|
||||
# TODO: pick the most common one and have that be the baseline
|
||||
most_common = attributes[(attributes.collection == collection)].sort_values('rarity', ascending=0).groupby('feature').head(1)
|
||||
most_common['col'] = most_common.apply(lambda x: 'std_{}_{}'.format( re.sub(' ', '_', x['feature'].lower()), x['value'] ), 1 )
|
||||
mc = most_common.col.unique()
|
||||
data = []
|
||||
for c0 in std_pred_cols_0:
|
||||
if c0 in ['std_rank','std_score','std_pct','std_timestamp','std_mn_20','std_log_mn_20']:
|
||||
continue
|
||||
f = '_'.join(re.split('_', c0)[1:-1])
|
||||
v = re.split('_', c0)[-1]
|
||||
rarity = p_metadata[p_metadata['{}_{}'.format(f, v)]==1]['{}_pct'.format(f)].values[0]
|
||||
# avg = p_metadata['{}_pct'.format(f)].mean()
|
||||
# avg_pct = df.pct.mean()
|
||||
# pct_std = ((avg_pct * r / avg) - avg_pct) / df.pct.std()
|
||||
r = df[df['{}_{}'.format(f, v)]==1].std_rank.mean()
|
||||
s = df[df['{}_{}'.format(f, v)]==1].std_score.mean()
|
||||
if r == r and s == s:
|
||||
datum = [ c0, rarity ]
|
||||
for c1 in std_pred_cols:
|
||||
datum.append(1 if c1 == c0 else r if c1 == 'std_rank' else s if c1 == 'std_score' else 1 if c1 in mc else 0 )
|
||||
data += [ datum ]
|
||||
|
||||
importance = pd.DataFrame(data, columns=['feature','rarity']+std_pred_cols)
|
||||
sorted(importance.feature.unique())
|
||||
importance[importance.feature == 'std_fur_/_skin_Leopard']
|
||||
if 'std_timestamp' in df.columns:
|
||||
importance['std_timestamp'] = df.std_timestamp.max()
|
||||
# importance['pred_lin'] = clf_lin.predict( importance[std_pred_cols].values )
|
||||
# importance['pred_log'] = np.exp(clf_log.predict( importance[std_pred_cols].values ))
|
||||
|
||||
importance['pred_lin'] = clf_lin.predict(importance[std_pred_cols].values)
|
||||
importance['pred_lin'] = importance.pred_lin.apply(lambda x: max(0, x) + l)
|
||||
# importance['pred_lin'] = importance.pred_lin.apply(lambda x: x + l)
|
||||
importance['pred_log'] = clf_log.predict(importance[std_pred_cols].values)
|
||||
importance['pred_log'] = importance.pred_log.apply(lambda x: max(1, x)) * l
|
||||
# importance['pred_log'] = importance.pred_log.apply(lambda x: x) * l
|
||||
|
||||
importance['pred'] = clf.predict( importance[[ 'pred_lin','pred_log' ]].values )
|
||||
# importance['pred'] = np.exp( (sd * model.predict(importance[std_pred_cols].values)) + mu)
|
||||
importance = importance.sort_values('pred', ascending=0)
|
||||
importance.head()[['feature','pred']]
|
||||
importance[importance.feature == 'std_fur_/_skin_Leopard']
|
||||
importance['feature'] = importance.feature.apply(lambda x: re.sub('std_', '', x))
|
||||
importance['value'] = importance.feature.apply(lambda x: re.split('_', x)[-1])
|
||||
importance['feature'] = importance.feature.apply(lambda x: '_'.join(re.split('_', x)[:-1]))
|
||||
mn = importance.groupby('feature').pred.min().reset_index().rename(columns={'pred':'baseline'})
|
||||
importance = importance.merge(mn)
|
||||
importance['pred_vs_baseline'] = importance.pred - importance.baseline
|
||||
importance['pct_vs_baseline'] = (importance.pred / importance.baseline) - 1
|
||||
importance[(importance.feature == 'fur_/_skin')].sort_values('pred')[['value','rarity','pred','pred_lin','pred_log','std_rank','std_score']].sort_values('rarity')
|
||||
importance['collection'] = collection
|
||||
importance.sort_values('pct_vs_baseline')[['feature','value','pct_vs_baseline']]
|
||||
tmp = importance[std_pred_cols].mean().reset_index()
|
||||
tmp.columns = [ 'a', 'b' ]
|
||||
tmp = tmp.sort_values('b')
|
||||
feature_values = feature_values.append(importance[['collection','feature','value','pred','pred_vs_baseline','pct_vs_baseline','rarity']])
|
||||
|
||||
attributes['feature'] = attributes.feature.apply(lambda x: re.sub('_', ' ', x).title() )
|
||||
feature_values['feature'] = feature_values.feature.apply(lambda x: re.sub('_', ' ', x).title() )
|
||||
|
||||
pred_price = pred_price[[ 'collection', 'contract_address', 'token_id', 'hri_rank', 'rk', 'pred_price', 'pred_sd' ]]
|
||||
|
||||
attributes['feature_name'] = attributes.feature_name.apply(lambda x: re.sub('_', ' ', x).title() )
|
||||
sorted(attributes['feature_name'].unique())
|
||||
if len(feature_values):
|
||||
feature_values['feature_name'] = feature_values.feature_name.apply(lambda x: re.sub('_', ' ', x).title() )
|
||||
|
||||
coefsdf.to_csv('./data/coefsdf.csv', index=False)
|
||||
salesdf.to_csv('./data/model_sales.csv', index=False)
|
||||
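The feature-importance step in the hunk above prices a synthetic token for every trait value, then compares each value against the cheapest value of the same trait: pred_vs_baseline is the absolute premium and pct_vs_baseline the relative one. A toy sketch of that baseline comparison (the predictions are invented):

import pandas as pd

importance = pd.DataFrame({'feature': ['hat', 'hat', 'hat', 'eyes', 'eyes'],
                           'value': ['None', 'Crown', 'Halo', 'Laser', 'Normal'],
                           'pred': [20.0, 55.0, 38.0, 31.0, 21.0]})  # predicted price with that trait value

# baseline = cheapest value within each trait
mn = importance.groupby('feature').pred.min().reset_index().rename(columns={'pred': 'baseline'})
importance = importance.merge(mn)
importance['pred_vs_baseline'] = importance.pred - importance.baseline
importance['pct_vs_baseline'] = (importance.pred / importance.baseline) - 1
print(importance.sort_values('pct_vs_baseline', ascending=False))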
@@ -511,24 +361,6 @@ pred_price.to_csv('./data/pred_price.csv', index=False)
attributes.to_csv('./data/attributes.csv', index=False)
feature_values.to_csv('./data/feature_values.csv', index=False)

pred_price = pd.read_csv('./data/pred_price.csv')
tokens = pd.read_csv('./data/tokens.csv')
rem = tokens[tokens.clean_token_id>=10000].token_id.unique()
l0 = len(pred_price)
pred_price = pred_price[ -((pred_price.collection == 'LunaBulls') & (pred_price.token_id.isin(rem))) ]
l1 = len(pred_price)
pred_price.to_csv('./data/pred_price.csv', index=False)

# excludedf.to_csv('./data/excludedf.csv', index=False)
# listings = pd.read_csv('./data/listings.csv')
# listings['token_id'] = listings.token_id.astype(int)

# tmp = salesdf.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
# tmp = pred_price.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
# tmp['token_id'] = tmp.token_id.astype(int)
# tmp = tmp.merge(listings[['collection','token_id','price']])
# tmp.sort_values('pred_price', ascending=0)

if CHECK_EXCLUDE:
salesdf['rat'] = salesdf.price / salesdf.pred
salesdf['dff'] = salesdf.price - salesdf.pred
@@ -542,9 +374,3 @@ if CHECK_EXCLUDE:
print(salesdf.exclude.mean())
salesdf[salesdf.token_id == '2239'][['collection','price','exclude']]
salesdf[salesdf.exclude == 1][[ 'collection','token_id','price','exclude' ]].to_csv('./data/exclude.csv', index=False)

attributes[ (attributes.collection == 'thugbirdz') & (attributes.token_id == '1869') ]
feature_values[ (feature_values.collection == 'thugbirdz') & (feature_values.feature == 'position_in_gang') ]
sorted(feature_values[ (feature_values.collection == 'thugbirdz') ].feature.unique())

pred_price[pred_price.collection == 'peskypenguinclub'].head()
solana_model_old.py — 574 lines (new file)

@@ -0,0 +1,574 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import warnings
|
||||
import requests
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import urllib.request
|
||||
import tensorflow as tf
|
||||
import snowflake.connector
|
||||
from datetime import datetime
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
|
||||
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
os.chdir('/Users/kellenblumberg/git/nft-deal-score')
|
||||
|
||||
CHECK_EXCLUDE = False
|
||||
CHECK_EXCLUDE = True
|
||||
|
||||
# Using sales from howrare.is - the last sale that was under 300 was when the floor was at 72. Filtering for when the floor is >100, the lowest sale was 400
|
||||
|
||||
###################################
|
||||
# Define Helper Functions #
|
||||
###################################
|
||||
def standardize_df(df, cols, usedf=None, verbose=False):
|
||||
for c in cols:
|
||||
if type(usedf) != type(pd.DataFrame()):
|
||||
usedf = df
|
||||
mu = usedf[c].mean()
|
||||
sd = usedf[c].std()
|
||||
if verbose:
|
||||
print(c)
|
||||
if len(df[c].unique()) == 2 and df[c].max() == 1 and df[c].min() == 0:
|
||||
df['std_{}'.format(c)] = df[c].apply(lambda x: (x*2) - 1 )
|
||||
else:
|
||||
df['std_{}'.format(c)] = (df[c] - mu) / sd
|
||||
return(df)
|
||||
|
||||
def just_float(x):
|
||||
x = re.sub('[^\d\.]', '', str(x))
|
||||
return(float(x))
|
||||
|
||||
def calculate_percentages(df, cols=[]):
|
||||
add_pct = not 'pct' in df.columns
|
||||
if not len(cols):
|
||||
cols = df.columns
|
||||
if add_pct:
|
||||
df['pct'] = 1
|
||||
for c in cols:
|
||||
g = df[c].value_counts().reset_index()
|
||||
g.columns = [ c, 'N' ]
|
||||
col = '{}_pct'.format(c)
|
||||
g[col] = g.N / g.N.sum()
|
||||
df = df.merge( g[[ c, col ]] )
|
||||
if add_pct:
|
||||
df['pct'] = df.pct * df[col]
|
||||
return(df)
|
||||
|
||||
exclude = [
|
||||
# (collection, token_id, price)
|
||||
( 'aurory', 2239, 3500 )
|
||||
# ( 'aurory', 856, 150 )
|
||||
# ( 'aurory', 4715, 500 )
|
||||
# ( 'aurory', 5561, 298 )
|
||||
# ( 'aurory', 5900, 199 )
|
||||
# ( 'aurory', 3323, 138 )
|
||||
]
|
||||
s_df = pd.read_csv('./data/sales.csv').rename(columns={'sale_date':'block_timestamp'})
|
||||
s_df[ s_df.collection == 'Levana Dragons' ].sort_values('block_timestamp', ascending=0).head()
|
||||
print(len(s_df[s_df.collection == 'Levana Dragon Eggs']))
|
||||
print(s_df.groupby('collection').token_id.count())
|
||||
s_df.collection.unique()
|
||||
s_df = s_df[-s_df.collection.isin(['Levana Meteors','Levana Dust'])]
|
||||
s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
|
||||
s_df = s_df[ -s_df.collection.isin(['boryokudragonz', 'Boryoku Dragonz']) ]
|
||||
for e in exclude:
|
||||
s_df = s_df[-( (s_df.collection == e[0]) & (s_df.token_id == e[1]) & (s_df.price == e[2]) )]
|
||||
s_df = s_df[ -((s_df.collection == 'smb') & (s_df.price < 1)) ]
|
||||
|
||||
# exclude weird data points
|
||||
if not CHECK_EXCLUDE:
|
||||
exclude = pd.read_csv('./data/exclude.csv')
|
||||
s_df = s_df.merge(exclude, how='left')
|
||||
s_df = s_df[s_df.exclude.isnull()]
|
||||
del s_df['exclude']
|
||||
|
||||
m_df = pd.read_csv('./data/metadata.csv')
|
||||
m_df['token_id'] = m_df.token_id.astype(str)
|
||||
tmp = m_df[m_df.collection.isin(['Levana Dragon Eggs','Levana Meteors','Levana Dust'])]
|
||||
tmp['tmp'] = tmp.token_id.astype(int)
|
||||
tmp.groupby('collection').tmp.max()
|
||||
m_df.head()
|
||||
# s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(x[:10], '%Y-%m-%d %H:%M:%S') )
|
||||
s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(str(x)[:19], '%Y-%m-%d %H:%M:%S') if len(x) > 10 else datetime.strptime(x[:10], '%Y-%m-%d') )
|
||||
s_df['timestamp'] = s_df.block_timestamp.astype(int)
|
||||
# del metadata['price']
|
||||
# del metadata['last_sale']
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
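# rolling floor proxy per collection: md_20 is a low quantile of the previous 20 sale prices
# (sales below ~75% of it are dropped as outliers), then mn_20 is rebuilt as the 10th
# percentile of the prior 20 remaining sale prices.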
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['days_ago'] = s_df.block_timestamp.apply(lambda x: (datetime.today() - x).days ).astype(int)
|
||||
s_df[[ 'block_timestamp','days_ago' ]].drop_duplicates(subset=['days_ago'])
|
||||
|
||||
s_df['av_20'] = s_df.groupby('collection')['mn_20'].rolling(20).mean().reset_index(0,drop=True)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
# s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).median().reset_index(0,drop=True)
|
||||
s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)
|
||||
# s_df[ (-((s_df.price) >= (s_df.md_20 * 0.2))) & (s_df.price.notnull()) & (s_df.collection == 'Levana Dragon Eggs') ]
|
||||
|
||||
s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.75) ]
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
|
||||
s_df = s_df.sort_values(['collection','block_timestamp'])
|
||||
# s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).min().reset_index(0,drop=True)
|
||||
s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.1).reset_index(0,drop=True)
|
||||
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(21).tail(40)
|
||||
s_df.sort_values(['collection','block_timestamp'])[['price','mn_20','block_timestamp']].head(20).sort_values('price')
|
||||
s_df['tmp'] = s_df.mn_20 / s_df.md_20
|
||||
|
||||
tmp = s_df[s_df.collection=='smb'][['mn_20','block_timestamp']]
|
||||
tmp['date'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
|
||||
tmp = tmp.groupby('date').mn_20.median().reset_index()
|
||||
tmp.to_csv('~/Downloads/tmp.csv', index=False)
|
||||
|
||||
s_df['tmp'] = s_df.price / s_df.mn_20
|
||||
s_df[s_df.collection == 'smb'].sort_values('block_timestamp')[['token_id','price','mn_20']]
|
||||
s_df[s_df.collection == 'smb'].sort_values('tmp').head(20)[['collection','token_id','price','mn_20','tmp']]
|
||||
s_df.groupby('collection').tmp.median()
|
||||
s_df.groupby('collection').tmp.mean()
|
||||
|
||||
s_df.sort_values('tmp').head()
|
||||
s_df['tmp'] = s_df.price / s_df.mn_20
|
||||
s_df[['collection','token_id','block_timestamp','price','mn_20','md_20','av_20','tmp']].to_csv('~/Downloads/tmp.csv', index=False)
|
||||
s_df.groupby('collection').tmp.median()
|
||||
s_df.groupby('collection').tmp.mean()
|
||||
s_df.sort_values('tmp', ascending=0).head()
|
||||
s_df.head(21)
|
||||
m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
|
||||
# m_df['feature_value'] = m_df.feature_value.apply(lambda x: x.strip() )
|
||||
# m_df.feature_value.unique()
|
||||
pred_cols = {}
|
||||
metadata = {}
|
||||
sales = {}
|
||||
collection_features = {}
|
||||
m_df[(m_df.collection == 'Galactic Punks') & (m_df.feature_name == 'pct')].sort_values('token_id')
|
||||
c = 'Levana Dragon Eggs'
|
||||
# pred_cols[c]
|
||||
EXCLUDE_COLS = {
|
||||
# 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
|
||||
'Levana Dragon Eggs': ['meteor_id','shower','lucky_number','cracking_date','attribute_count']
|
||||
}
|
||||
NUMERIC_COLS = {
|
||||
'Levana Dragon Eggs': ['rank','score','pct','collection_rank','weight','temperature']
|
||||
}
|
||||
for c in s_df.collection.unique():
|
||||
print('Building {} model'.format(c))
|
||||
exclude = EXCLUDE_COLS[c] if c in EXCLUDE_COLS.keys() else []
|
||||
n_cols = NUMERIC_COLS[c] if c in NUMERIC_COLS.keys() else []
|
||||
exclude = [ x for x in exclude if not x in n_cols ]
|
||||
o_cols = sorted([x for x in m_df[ m_df.collection == c ].feature_name.unique() if (not x in exclude) and not (x in n_cols) ])
|
||||
|
||||
sales[c] = s_df[ s_df.collection == c ]
|
||||
pred_cols[c] = sorted( n_cols + o_cols )
|
||||
collection_features[c] = [ c for c in pred_cols[c] if not c in ['score','rank','pct']+exclude ]
|
||||
metadata[c] = m_df[ (m_df.collection == c) & (-(m_df.feature_name.isin(exclude))) ]
|
||||
|
||||
# tmp = pd.pivot_table( metadata[c], ['collection','token_id'], columns=['feature_name'], values=['feature_value'] )
|
||||
metadata[c] = metadata[c].pivot( index=['collection','token_id'], columns=['feature_name'], values=['feature_value'] ).reset_index()
|
||||
metadata[c].columns = [ 'collection','token_id' ] + pred_cols[c]
|
||||
|
||||
features = collection_features[c]
|
||||
cur = metadata[c]
|
||||
# cur = cur.dropna(subset=features)
|
||||
for f in features:
|
||||
if isinstance(cur[f].values[0], str):
cur[f] = cur[f].apply(lambda x: re.sub("\"", "", str(x) ) )
cur[f] = cur[f].apply(lambda x: re.split(r"\(", x )[0].strip())
|
||||
cur = cur.replace('', 'Default')
|
||||
# if not 'pct' in cur.columns:
|
||||
cur = calculate_percentages( cur, o_cols )
|
||||
dummies = pd.get_dummies(cur[o_cols])
|
||||
# feature_cols = dummies.columns
|
||||
cur = pd.concat([ cur.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
|
||||
metadata[c] = cur
|
||||
# pred_cols[c] = ['rank','score','timestamp','mn_20','log_mn_20'] + list(dummies.columns)
|
||||
# cols = [ 'collection_rank' ]
|
||||
# cols = [ ]
|
||||
# pred_cols[c] = [ 'rank','transform_rank','score'] + n_cols + [x for x in cols if x in m_df.feature_name.unique()] + list(dummies.columns)
|
||||
# pred_cols[c] = [ 'rank','transform_rank','score'] + n_cols + list(dummies.columns)
|
||||
pred_cols[c] = n_cols + list(dummies.columns)
|
||||
|
||||
# collection_features = {
|
||||
# 'Hashmasks': [ 'character','eyecolor','item','mask','skincolor' ]
|
||||
# , 'Galactic Punks': [ 'backgrounds','hair','species','suits','jewelry','headware','glasses' ]
|
||||
# , 'Solana Monkey Business': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
|
||||
# , 'Aurory': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
|
||||
# # , 'Thugbirdz': [ 'attribute_count','type','clothes','ears','mouth','eyes','hat','background' ]
|
||||
# }
|
||||
|
||||
coefsdf = pd.DataFrame()
|
||||
salesdf = pd.DataFrame()
|
||||
attributes = pd.DataFrame()
|
||||
pred_price = pd.DataFrame()
|
||||
feature_values = pd.DataFrame()
|
||||
collections = sorted(metadata.keys())
|
||||
collection = 'Galactic Punks'
|
||||
tokens = pd.read_csv('./data/tokens.csv')
|
||||
collection = 'Levana Dragon Eggs'
|
||||
# for collection in s_df.collection.unique():
|
||||
for collection in ['Levana Dragon Eggs']:
|
||||
# collection = 'LunaBulls'
|
||||
# collection = 'smb'
|
||||
# collection = 'aurory'
|
||||
# collection = 'meerkatmillionaires'
|
||||
print('Working on collection {}'.format(collection))
|
||||
p_metadata = metadata[collection]
|
||||
if 'attribute_count' in p_metadata.columns:
|
||||
p_metadata['attribute_count'] = p_metadata.attribute_count.astype(float).astype(int)
|
||||
|
||||
p_sales = sales[collection]
|
||||
# specify the predictive features
|
||||
p_pred_cols = pred_cols[collection]
|
||||
if collection == 'Levana Dragon Eggs':
|
||||
p_pred_cols += [ 'transformed_collection_rank' ]
|
||||
p_features = collection_features[collection]
|
||||
p_sales['token_id'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
p_metadata['token_id'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
for c in [ 'rank','score' ]:
|
||||
p_metadata[c] = p_metadata[c].astype(float)
|
||||
# p_sales['contract_address'] = p_sales.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
# p_metadata['contract_address'] = p_metadata.token_id.apply(lambda x: re.sub("\"", "", str(x)) )
|
||||
p_sales['contract_address'] = ''
|
||||
p_metadata['contract_address'] = ''
|
||||
|
||||
# remove 1 column from each group (since they are collinear)
|
||||
# exclude = []
|
||||
# for f in p_features:
|
||||
# e = [ c for c in p_pred_cols if c[:len(f)] == f ][-1]
|
||||
# exclude.append(e)
|
||||
|
||||
df = p_sales.merge(p_metadata, on=['token_id','contract_address'])
|
||||
df = df[df.mn_20.notnull()]
|
||||
target_col = 'adj_price'
|
||||
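# adjusted target: the sale price is floored at 70% of (mn_20 - 0.2), so extreme
# below-floor sales do not dominate the fit.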
df[target_col] = df.apply(lambda x: max(0.7 * (x['mn_20'] - 0.2), x['price']), 1 )
|
||||
# df['mn_20'] = df.apply(lambda x: min(x[target_col], x['mn_20']), 1 )
|
||||
# tmp = df[['block_timestamp','mn_20']].copy()
|
||||
# tmp['tmp'] = tmp.block_timestamp.apply(lambda x: str(x)[:10] )
|
||||
# tmp = tmp.groupby('tmp').mn_20.median().reset_index()
|
||||
# tmp.sort_values('tmp').to_csv('~/Downloads/tmp.csv', index=False)
|
||||
# df['timestamp'] = df.block_timestamp.astype(int)
|
||||
df = df[df[target_col].notnull()]
|
||||
df = df.reset_index(drop=True)
|
||||
df['transform_rank'] = df['rank'].apply(lambda x: 1.0 / (x**2) )
|
||||
df['rel_price_0'] = df[target_col] - df.mn_20
|
||||
df['rel_price_1'] = df[target_col] / df.mn_20
|
||||
df = df[df.mn_20 > 0]
|
||||
df['log_mn_20'] = np.log(df.mn_20)
|
||||
print('Training on {} sales'.format(len(df)))
|
||||
# df['price_median'] = df.groupby('token_id').price.median()
|
||||
|
||||
# standardize columns to mean 0 sd 1
|
||||
len(p_pred_cols)
|
||||
n_cols = NUMERIC_COLS[collection] if collection in NUMERIC_COLS.keys() else []
|
||||
for c in n_cols:
|
||||
df[c] = df[c].apply(lambda x: just_float(x) )
|
||||
if collection == 'Levana Dragon Eggs':
|
||||
df['transformed_collection_rank'] = df.collection_rank.apply(lambda x: (1.0/ x)**2 )
|
||||
df = standardize_df(df, p_pred_cols)
|
||||
std_pred_cols_0 = [ 'std_{}'.format(c) for c in p_pred_cols ]
|
||||
# p_pred_cols = [ c for c in p_pred_cols if not c in exclude ]
|
||||
std_pred_cols = [ 'std_{}'.format(c) for c in p_pred_cols ]
|
||||
df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
|
||||
# df.sort_values('block_timestamp').head(10)[['price','tx_id']]
|
||||
# df.sort_values('block_timestamp').head(10)[['price','tx_id']].tx_id.values
|
||||
# df = df[df.price >= 1]
|
||||
|
||||
#########################
|
||||
# Run the Model #
|
||||
#########################
|
||||
len(df)
|
||||
len(df.dropna(subset=std_pred_cols))
|
||||
tmp = df[std_pred_cols].count().reset_index()
|
||||
tmp.columns = ['a','b']
|
||||
tmp.sort_values('b').head(20)
|
||||
rem = list(tmp[tmp.b==0].a.values)
|
||||
std_pred_cols = [ c for c in std_pred_cols if not c in rem ]
|
||||
mn = df.timestamp.min()
|
||||
mx = df.timestamp.max()
|
||||
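# exponential recency weight: the most recent sale counts 2.5x as much as the oldest one.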
df['weight'] = df.timestamp.apply(lambda x: 2.5 ** ((x - mn) / (mx - mn)) )
|
||||
X = df[std_pred_cols].values
|
||||
mu = df.log_price.mean()
|
||||
sd = df.log_price.std()
|
||||
df['std_log_price'] = (df.log_price - mu) / sd
|
||||
# y = df.std_log_price.values
|
||||
# y = df[target_col].values
|
||||
# y = df.rel_price_1.values
|
||||
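# two targets: the sale's absolute premium over the floor (rel_price_0) and its
# multiplicative premium over the floor (rel_price_1).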
y_0 = df.rel_price_0.values
|
||||
y_1 = df.rel_price_1.values
|
||||
# y_log = df.log_price.values
|
||||
|
||||
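# additive model on the absolute premium: Lasso for Levana Dragon Eggs (many sparse dummies),
# otherwise RidgeCV over a geometric grid of alphas; sample weights favor recent sales.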
clf_lin = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_lin.fit(X, y_0, df.weight.values)
|
||||
coefs = []
|
||||
for a, b in zip(std_pred_cols, clf_lin.coef_):
|
||||
coefs += [[a,b]]
|
||||
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
|
||||
coefs.to_csv('~/Downloads/tmp.csv', index=False)
|
||||
df['pred_lin'] = clf_lin.predict(X)
|
||||
df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
|
||||
df['err_lin'] = abs(((df.pred_lin - df[target_col]) / df[target_col]) )
|
||||
# df['err_lin'] = abs(df.pred_lin - df.price )
|
||||
# df[[ 'price','pred_lin','err_lin','mn_20' ]].sort_values('err_lin').tail(50)
|
||||
df.head()
|
||||
clf_log = Lasso() if collection in [ 'Levana Dragon Eggs' ] else RidgeCV(alphas=[1.5**x for x in range(20)])
|
||||
clf_log.fit(X, y_1, df.weight.values)
|
||||
coefs = []
|
||||
for a, b in zip(std_pred_cols, clf_log.coef_):
|
||||
coefs += [[a,b]]
|
||||
coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
|
||||
coefs.to_csv('~/Downloads/tmp.csv', index=False)
|
||||
df['pred_log'] = clf_log.predict(X)
|
||||
df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
|
||||
df['err_log'] = abs(((df.pred_log - df[target_col]) / df[target_col]) )
|
||||
df[[ target_col,'pred_log','err_log','mn_20' ]].sort_values('err_log').tail(50)
|
||||
df['err'] = df.err_lin * df.err_log
|
||||
|
||||
df[[ target_col,'pred_log','err_log','err_lin','err','mn_20' ]].sort_values('err').tail(50)
|
||||
df['collection'] = collection
|
||||
|
||||
# df['pred_lin'] = clf_lin.predict(X)
|
||||
# df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
|
||||
# df['pred_log'] = np.exp(clf_log.predict(X))
|
||||
# df['pred_log'] = clf_log.predict(X)
|
||||
# df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
|
||||
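# blend the additive (pred_lin) and multiplicative (pred_log) predictions with a no-intercept
# linear regression; if either blend weight comes out negative, fall back to the other model alone.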
clf = LinearRegression(fit_intercept=False)
|
||||
clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.weight.values )
|
||||
print('Price = {} * lin + {} * log'.format( round(clf.coef_[0], 2), round(clf.coef_[1], 2) ))
|
||||
l = df.sort_values('block_timestamp', ascending=0).mn_20.values[0]
|
||||
tmp = pd.DataFrame([[collection, clf.coef_[0], clf.coef_[1], l]], columns=['collection','lin_coef','log_coef','floor_price'])
|
||||
if clf.coef_[0] < 0:
|
||||
print('Only using log')
|
||||
df['pred'] = df.pred_log
|
||||
tmp['lin_coef'] = 0
|
||||
tmp['log_coef'] = 1
|
||||
elif clf.coef_[1] < 0:
|
||||
print('Only using lin')
|
||||
df['pred'] = df.pred_lin
|
||||
tmp['lin_coef'] = 1
|
||||
tmp['log_coef'] = 0
|
||||
else:
|
||||
print('Using BOTH!')
|
||||
df['pred'] = clf.predict( df[['pred_lin','pred_log']].values )
|
||||
coefsdf = coefsdf.append(tmp)
|
||||
df['err'] = (df.pred / df[target_col]).apply(lambda x: abs(x-1) )
|
||||
df[df.block_timestamp>='2021-10-01'].sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'token_id','block_timestamp','err','mn_20' ]]
|
||||
# df[df.block_timestamp>='2021-10-01'].err.mean()
|
||||
df.merge(tokens[['collection','token_id','clean_token_id']]).sort_values('err', ascending=0).head(10)[[ 'pred',target_col,'clean_token_id','rank','block_timestamp','err','mn_20','tx_id' ]]
|
||||
df.sort_values('price', ascending=0).head(20)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
|
||||
df.sort_values('price', ascending=0).tail(40)[[ 'price','pred',target_col,'token_id','block_timestamp','err','mn_20','tx_id' ]]
|
||||
df.sort_values('price', ascending=0).head(20).tx_id.values
|
||||
|
||||
# print(np.mean(y))
|
||||
# print(np.mean(clf.predict(X)))
|
||||
|
||||
# # run neural net
|
||||
# model = tf.keras.models.Sequential([
|
||||
# tf.keras.layers.Dense(9, activation='relu')
|
||||
# , tf.keras.layers.Dropout(.2)
|
||||
# , tf.keras.layers.Dense(3, activation='relu')
|
||||
# , tf.keras.layers.Dropout(.2)
|
||||
# , tf.keras.layers.Dense(1, activation='linear')
|
||||
# ])
|
||||
# model.compile(loss='mae', optimizer=tf.keras.optimizers.SGD(learning_rate=0.0025))
|
||||
# model.fit(X, y, epochs=500, validation_split=0.3)
|
||||
|
||||
# df['pred'] = np.exp( (sd * model.predict(df[std_pred_cols].values)) + mu)
|
||||
# df['pred'] = model.predict(df[std_pred_cols].values)
|
||||
# ratio = df.price.mean() / df.pred.mean()
|
||||
# print("Manually increasing predictions by {}%".format(round((ratio-1) * 100, 1)))
|
||||
|
||||
# checking errors
|
||||
# df['pred'] = df.pred * ratio
|
||||
df['err'] = df[target_col] - df.pred
|
||||
df['q'] = df.pred.rank() * 10 / len(df)
|
||||
df['q'] = df.q.apply(lambda x: int(round(x)) )
|
||||
df['pct_err'] = (df[target_col] / df.pred) - 1
|
||||
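# prediction uncertainty: the sd of percentage errors (clipped to +/-90%, recent sales only)
# is used below to scale each token's pred_sd.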
pe_mu = df.pct_err.mean()
|
||||
pe_sd = df[ (df.pct_err > -.9) & (df.pct_err < 0.9) ].pct_err.std()
|
||||
pe_sd = df[ (df.pct_err > -.9) & (df.pct_err < 0.9) & (df.days_ago<=50) ].pct_err.std()
|
||||
df['pred_price'] = df.pred#.apply(lambda x: x*(1+pe_mu) )
|
||||
df['pred_sd'] = df.pred * pe_sd
|
||||
print(df.groupby('q')[['err','pred',target_col]].mean())
|
||||
print(df[df.weight >= df.weight.median()].groupby('q')[['err','pred',target_col]].mean())
|
||||
# df.err.mean()
|
||||
# df[df.weight >= 3.5].err.mean()
|
||||
df['collection'] = collection
|
||||
print('Avg err last 100: {}'.format(round(df.sort_values('block_timestamp').head(100).err.mean(), 2)))
|
||||
salesdf = salesdf.append( df[[ 'collection','contract_address','token_id','block_timestamp','price','pred','mn_20','rank','score' ]].sort_values('block_timestamp', ascending=0) )
|
||||
|
||||
# create the attributes dataframe
|
||||
for f in p_features:
|
||||
if f and '{}_pct'.format(f) in p_metadata.columns:
|
||||
cur = p_metadata[[ 'token_id', f, '{}_pct'.format(f) ]]
|
||||
cur.columns = [ 'token_id', 'value','rarity' ]
|
||||
cur['feature'] = f
|
||||
cur['collection'] = collection
|
||||
attributes = attributes.append(cur)
|
||||
|
||||
# create predictions for each NFT in the collection
|
||||
test = p_metadata.copy()
|
||||
for c in n_cols:
|
||||
test[c] = test[c].apply(lambda x: just_float(x) )
|
||||
if collection in [ 'Levana Dragon Eggs' ]:
|
||||
test['transformed_collection_rank'] = test.collection_rank.apply(lambda x: (1.0 / x) ** 2 )
|
||||
tail = df.sort_values('timestamp').tail(1)
|
||||
for c in [ 'std_timestamp','mn_20','log_mn_20' ]:
|
||||
if c in tail.columns:
|
||||
test[c] = tail[c].values[0]
|
||||
test = standardize_df(test, [c for c in p_pred_cols if not c in ['timestamp'] ], df, True)
|
||||
# test['pred_lin'] = clf_lin.predict( test[std_pred_cols].values )
|
||||
# test['pred_log'] = np.exp(clf_log.predict( test[std_pred_cols].values ))
|
||||
|
||||
test['pred_lin'] = clf_lin.predict(test[std_pred_cols].values)
|
||||
test['pred_lin'] = test.pred_lin.apply(lambda x: max(0, x) + l)
|
||||
# test['pred_lin'] = df.pred_lin + df.mn_20
|
||||
# df['pred_log'] = np.exp(clf_log.predict(X))
|
||||
test['pred_log'] = clf_log.predict(test[std_pred_cols].values)
|
||||
test['pred_log'] = test.pred_log.apply(lambda x: max(1, x)) * l
|
||||
|
||||
test['pred'] = clf.predict( test[[ 'pred_lin','pred_log' ]].values )
|
||||
# test['pred'] = np.exp( (sd * model.predict(test[std_pred_cols].values)) + mu) * ratio
|
||||
test['pred_price'] = test.pred#.apply(lambda x: x*(1+pe_mu) )
|
||||
if not CHECK_EXCLUDE:
|
||||
test['pred_price'] = test.pred.apply(lambda x: (x*0.985) )
|
||||
test['pred_sd'] = test.pred * pe_sd
|
||||
test['rk'] = test.pred.rank(ascending=0, method='first')
|
||||
test['collection'] = collection
|
||||
pred_price = pred_price.append( test[[ 'collection', 'contract_address','token_id','rank','rk','pred_price','pred_sd' ] + p_features].rename(columns={'rank':'hri_rank'}).sort_values('pred_price') )
|
||||
# print(test[[ 'contract_address','token_id','pred_price','pred_sd' ]].sort_values('pred_price'))
|
||||
|
||||
|
||||
##############################
|
||||
# Feature Importance #
|
||||
##############################
|
||||
coefs = []
|
||||
for a, b, c in zip(p_pred_cols, clf_lin.coef_, clf_log.coef_):
|
||||
coefs += [[ collection, a, b, c ]]
|
||||
coefs = pd.DataFrame(coefs, columns=['collection','col','lin_coef','log_coef'])
|
||||
# coefs['feature'] = coefs.col.apply(lambda x: ' '.join(re.split('_', x)[:-1]).title() )
|
||||
# coefs['feature'] = coefs.col.apply(lambda x: '_'.join(re.split('_', x)[:-1]) )
|
||||
# coefs['value'] = coefs.col.apply(lambda x: re.split('_', x)[-1] )
|
||||
# mn = coefs.groupby('feature')[[ 'lin_coef','log_coef' ]].min().reset_index()
|
||||
# mn.columns = [ 'feature','mn_lin_coef','mn_log_coef' ]
|
||||
# coefs = coefs.merge(mn)
|
||||
# coefs['lin_coef'] = coefs.lin_coef - coefs.mn_lin_coef
|
||||
# coefs['log_coef'] = coefs.log_coef - coefs.mn_log_coef
|
||||
# coefs
|
||||
# g = attributes[ attributes.collection == collection ][[ 'feature','value','rarity' ]].drop_duplicates()
|
||||
# g['value'] = g.value.astype(str)
|
||||
# len(coefs)
|
||||
# g = coefs.merge(g, how='left')
|
||||
# g[g.rarity.isnull()]
|
||||
# len(g)
|
||||
# coefs = coefs.merge( m_df[ m_df.collection == collection ][[ 'feature_name','' ]] )
|
||||
# coefs.sort_values('lin_coef').tail(20)
|
||||
|
||||
# TODO: pick the most common one and have that be the baseline
|
||||
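# feature importance: build one synthetic token per trait value - that trait's dummy set to 1,
# std_rank/std_score set to the average of real tokens holding the trait, and (roughly) every
# other feature held at its most common value - then price it with the fitted models.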
most_common = attributes[(attributes.collection == collection)].sort_values('rarity', ascending=0).groupby('feature').head(1)
|
||||
most_common['col'] = most_common.apply(lambda x: 'std_{}_{}'.format( re.sub(' ', '_', x['feature'].lower()), x['value'] ), 1 )
|
||||
mc = most_common.col.unique()
|
||||
data = []
|
||||
for c0 in std_pred_cols_0:
|
||||
if c0 in ['std_rank','std_score','std_pct','std_timestamp','std_mn_20','std_log_mn_20']:
|
||||
continue
|
||||
f = '_'.join(re.split('_', c0)[1:-1])
|
||||
v = re.split('_', c0)[-1]
|
||||
rarity = p_metadata[p_metadata['{}_{}'.format(f, v)]==1]['{}_pct'.format(f)].values[0]
|
||||
# avg = p_metadata['{}_pct'.format(f)].mean()
|
||||
# avg_pct = df.pct.mean()
|
||||
# pct_std = ((avg_pct * r / avg) - avg_pct) / df.pct.std()
|
||||
r = df[df['{}_{}'.format(f, v)]==1].std_rank.mean()
|
||||
s = df[df['{}_{}'.format(f, v)]==1].std_score.mean()
|
||||
if r == r and s == s:
|
||||
datum = [ c0, rarity ]
|
||||
for c1 in std_pred_cols:
|
||||
datum.append(1 if c1 == c0 else r if c1 == 'std_rank' else s if c1 == 'std_score' else 1 if c1 in mc else 0 )
|
||||
data += [ datum ]
|
||||
|
||||
importance = pd.DataFrame(data, columns=['feature','rarity']+std_pred_cols)
|
||||
sorted(importance.feature.unique())
|
||||
importance[importance.feature == 'std_fur_/_skin_Leopard']
|
||||
if 'std_timestamp' in df.columns:
|
||||
importance['std_timestamp'] = df.std_timestamp.max()
|
||||
# importance['pred_lin'] = clf_lin.predict( importance[std_pred_cols].values )
|
||||
# importance['pred_log'] = np.exp(clf_log.predict( importance[std_pred_cols].values ))
|
||||
|
||||
importance['pred_lin'] = clf_lin.predict(importance[std_pred_cols].values)
|
||||
importance['pred_lin'] = importance.pred_lin.apply(lambda x: max(0, x) + l)
|
||||
# importance['pred_lin'] = importance.pred_lin.apply(lambda x: x + l)
|
||||
importance['pred_log'] = clf_log.predict(importance[std_pred_cols].values)
|
||||
importance['pred_log'] = importance.pred_log.apply(lambda x: max(1, x)) * l
|
||||
# importance['pred_log'] = importance.pred_log.apply(lambda x: x) * l
|
||||
|
||||
importance['pred'] = clf.predict( importance[[ 'pred_lin','pred_log' ]].values )
|
||||
# importance['pred'] = np.exp( (sd * model.predict(importance[std_pred_cols].values)) + mu)
|
||||
importance = importance.sort_values('pred', ascending=0)
|
||||
importance.head()[['feature','pred']]
|
||||
importance[importance.feature == 'std_fur_/_skin_Leopard']
|
||||
importance['feature'] = importance.feature.apply(lambda x: re.sub('std_', '', x))
|
||||
importance['value'] = importance.feature.apply(lambda x: re.split('_', x)[-1])
|
||||
importance['feature'] = importance.feature.apply(lambda x: '_'.join(re.split('_', x)[:-1]))
|
||||
mn = importance.groupby('feature').pred.min().reset_index().rename(columns={'pred':'baseline'})
|
||||
importance = importance.merge(mn)
|
||||
importance['pred_vs_baseline'] = importance.pred - importance.baseline
|
||||
importance['pct_vs_baseline'] = (importance.pred / importance.baseline) - 1
|
||||
importance[(importance.feature == 'fur_/_skin')].sort_values('pred')[['value','rarity','pred','pred_lin','pred_log','std_rank','std_score']].sort_values('rarity')
|
||||
importance['collection'] = collection
|
||||
importance.sort_values('pct_vs_baseline')[['feature','value','pct_vs_baseline']]
|
||||
tmp = importance[std_pred_cols].mean().reset_index()
|
||||
tmp.columns = [ 'a', 'b' ]
|
||||
tmp = tmp.sort_values('b')
|
||||
feature_values = feature_values.append(importance[['collection','feature','value','pred','pred_vs_baseline','pct_vs_baseline','rarity']])
|
||||
|
||||
attributes['feature'] = attributes.feature.apply(lambda x: re.sub('_', ' ', x).title() )
|
||||
feature_values['feature'] = feature_values.feature.apply(lambda x: re.sub('_', ' ', x).title() )
|
||||
|
||||
pred_price = pred_price[[ 'collection', 'contract_address', 'token_id', 'hri_rank', 'rk', 'pred_price', 'pred_sd' ]]
|
||||
|
||||
|
||||
coefsdf.to_csv('./data/coefsdf.csv', index=False)
|
||||
salesdf.to_csv('./data/model_sales.csv', index=False)
|
||||
pred_price.to_csv('./data/pred_price.csv', index=False)
|
||||
attributes.to_csv('./data/attributes.csv', index=False)
|
||||
feature_values.to_csv('./data/feature_values.csv', index=False)
|
||||
|
||||
pred_price = pd.read_csv('./data/pred_price.csv')
|
||||
tokens = pd.read_csv('./data/tokens.csv')
|
||||
rem = tokens[tokens.clean_token_id>=10000].token_id.unique()
|
||||
l0 = len(pred_price)
|
||||
pred_price = pred_price[ -((pred_price.collection == 'LunaBulls') & (pred_price.token_id.isin(rem))) ]
|
||||
l1 = len(pred_price)
|
||||
pred_price.to_csv('./data/pred_price.csv', index=False)
|
||||
|
||||
# listings = pd.read_csv('./data/listings.csv')
|
||||
# listings['token_id'] = listings.token_id.astype(int)
|
||||
|
||||
# tmp = salesdf.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
|
||||
# tmp = pred_price.merge(attributes[ (attributes.collection == 'thugbirdz') & (attributes.feature == 'Position In Gang') & (attributes.value == 'Underboss') ])
|
||||
# tmp['token_id'] = tmp.token_id.astype(int)
|
||||
# tmp = tmp.merge(listings[['collection','token_id','price']])
|
||||
# tmp.sort_values('pred_price', ascending=0)
|
||||
|
||||
if CHECK_EXCLUDE:
|
||||
salesdf['rat'] = salesdf.price / salesdf.pred
|
||||
salesdf['dff'] = salesdf.price - salesdf.pred
|
||||
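# flag a sale for exclusion when price and prediction diverge by both a large absolute gap and
# a large ratio (tiered thresholds), checked in both directions below.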
salesdf['exclude_1'] = (((salesdf.dff >= 20) & (salesdf.rat > 4)) | ((salesdf.dff >= 40) & (salesdf.rat > 3)) | ((salesdf.dff >= 60) & (salesdf.rat > 2)) | ((salesdf.dff >= 80) & (salesdf.rat > 2))).astype(int)
|
||||
salesdf['rat'] = salesdf.pred / salesdf.price
|
||||
salesdf['dff'] = salesdf.pred - salesdf.price
|
||||
salesdf['exclude_2'] = (((salesdf.dff >= 20) & (salesdf.rat > 4)) | ((salesdf.dff >= 40) & (salesdf.rat > 3)) | ((salesdf.dff >= 60) & (salesdf.rat > 2)) | ((salesdf.dff >= 80) & (salesdf.rat > 2))).astype(int)
|
||||
salesdf['exclude'] = (salesdf.exclude_1 + salesdf.exclude_2).apply(lambda x: int(x>0))
|
||||
print(salesdf.exclude_1.mean())
|
||||
print(salesdf.exclude_2.mean())
|
||||
print(salesdf.exclude.mean())
|
||||
salesdf[salesdf.token_id == '2239'][['collection','price','exclude']]
|
||||
salesdf[salesdf.exclude == 1][[ 'collection','token_id','price','exclude' ]].to_csv('./data/exclude.csv', index=False)
|
||||
|
||||
attributes[ (attributes.collection == 'thugbirdz') & (attributes.token_id == '1869') ]
|
||||
feature_values[ (feature_values.collection == 'thugbirdz') & (feature_values.feature == 'position_in_gang') ]
|
||||
sorted(feature_values[ (feature_values.collection == 'thugbirdz') ].feature.unique())
|
||||
|
||||
pred_price[pred_price.collection == 'peskypenguinclub'].head()
|
||||
79
viz/server.R
79
viz/server.R
@ -45,7 +45,7 @@ server <- function(input, output, session) {
|
||||
selectInput(
|
||||
inputId = 'collectionname'
|
||||
, label = NULL
|
||||
, selected = 'LunaBulls'
|
||||
, selected = 'Levana Dragon Eggs'
|
||||
, choices = choices
|
||||
, width = "100%"
|
||||
)
|
||||
@ -197,24 +197,27 @@ server <- function(input, output, session) {
|
||||
return(head(attributes, 0))
|
||||
}
|
||||
cur <- attributes[ token_id == eval(as.numeric(id)) & collection == eval(selected) ]
|
||||
cur <- merge( cur, feature_values[collection == eval(selected), list(feature, value, pred_vs_baseline, pct_vs_baseline) ], all.x=TRUE )
|
||||
# cur <- merge( cur, feature_values[collection == eval(selected), list(feature_name, feature_value, pred_vs_baseline, pct_vs_baseline) ], all.x=TRUE )
|
||||
cur <- cur[order(rarity)]
|
||||
floor <- getFloors()[2]
|
||||
log_coef <- coefsdf[ collection == eval(selected) ]$log_coef[1]
|
||||
lin_coef <- coefsdf[ collection == eval(selected) ]$lin_coef[1]
|
||||
s <- sum(cur$pct_vs_baseline)
|
||||
p <- getPredPrice()
|
||||
p <- as.numeric(p[ token_id == eval(as.numeric(id)) ]$pred_price)
|
||||
# p <- pred_price[ token_id == eval(as.numeric(id)) & collection == eval(selected) ]$pred_price
|
||||
ratio <- (p / floor) - 1
|
||||
ratio <- pmax(0, ratio)
|
||||
if (ratio > 0 & length(ratio) > 0) {
|
||||
mult <- ratio / s
|
||||
cur[, pct_vs_baseline := pct_vs_baseline * eval(mult) ]
|
||||
}
|
||||
cur[, vs_baseline := round((pred_vs_baseline * eval(lin_coef)) + (pct_vs_baseline * eval(floor) * eval(log_coef) ), 1) ]
|
||||
cur[, pred_vs_baseline := round(pred_vs_baseline, 1) ]
|
||||
cur[, vs_baseline := round(pred_vs_baseline + (pct_vs_baseline * eval(floor)), 1) ]
|
||||
# floor <- getFloors()[2]
|
||||
# log_coef <- coefsdf[ collection == eval(selected) ]$log_coef[1]
|
||||
# lin_coef <- coefsdf[ collection == eval(selected) ]$lin_coef[1]
|
||||
# s <- sum(cur$pct_vs_baseline)
|
||||
# p <- getPredPrice()
|
||||
# p <- as.numeric(p[ token_id == eval(as.numeric(id)) ]$pred_price)
|
||||
# # p <- pred_price[ token_id == eval(as.numeric(id)) & collection == eval(selected) ]$pred_price
|
||||
# ratio <- (p / floor) - 1
|
||||
# ratio <- pmax(0, ratio)
|
||||
# if (ratio > 0 & length(ratio) > 0) {
|
||||
# mult <- ratio / s
|
||||
# cur[, pct_vs_baseline := pct_vs_baseline * eval(mult) ]
|
||||
# }
|
||||
cur[, vs_baseline := 0 ]
|
||||
cur[, pred_vs_baseline := 0 ]
|
||||
cur[, vs_baseline := 0 ]
|
||||
# cur[, vs_baseline := round((pred_vs_baseline * eval(lin_coef)) + (pct_vs_baseline * eval(floor) * eval(log_coef) ), 1) ]
|
||||
# cur[, pred_vs_baseline := round(pred_vs_baseline, 1) ]
|
||||
# cur[, vs_baseline := round(pred_vs_baseline + (pct_vs_baseline * eval(floor)), 1) ]
|
||||
return(cur)
|
||||
})
|
||||
|
||||
@ -223,9 +226,11 @@ server <- function(input, output, session) {
|
||||
if( nrow(data) == 0 ) {
|
||||
return(NULL)
|
||||
}
|
||||
data[, rarity := paste0(format(round(rarity*100, 2), digits=4, decimal.mark="."),'%') ]
|
||||
data[, rarity := ifelse(is.na(rarity), '', paste0(format(round(rarity*100, 2), digits=4, decimal.mark="."),'%') )]
|
||||
|
||||
# reactable(data[, list( feature, value, rarity, vs_baseline, pred_vs_baseline, pct_vs_baseline )],
|
||||
data <- data[, list( feature, value, rarity, pct_vs_baseline )]
|
||||
# data <- data[, list( feature, value, rarity, pct_vs_baseline )]
|
||||
data <- data[, list( feature_name, feature_value, rarity )]
|
||||
reactable(data,
|
||||
defaultColDef = colDef(
|
||||
headerStyle = list(background = "#10151A")
|
||||
@ -234,17 +239,17 @@ server <- function(input, output, session) {
|
||||
borderless = TRUE,
|
||||
outlined = FALSE,
|
||||
columns = list(
|
||||
feature = colDef(name = "Attribute", align = "left"),
|
||||
value = colDef(name = "Value", align = "left"),
|
||||
rarity = colDef(name = "Rarity", align = "left"),
|
||||
pct_vs_baseline = colDef(
|
||||
name="Value", header=with_tooltip("Value", "The estimated price impact of this feature vs the floor")
|
||||
, html = TRUE
|
||||
, align = "left"
|
||||
, cell = function(x) {
|
||||
htmltools::tags$span(paste0('+', format(round(x*1000)/10, digits=4, decimal.mark=".", big.mark=","), '%'))
|
||||
}
|
||||
)
|
||||
feature_name = colDef(name = "Attribute", align = "left"),
|
||||
feature_value = colDef(name = "Value", align = "left"),
|
||||
rarity = colDef(name = "Rarity", align = "left")
|
||||
# pct_vs_baseline = colDef(
|
||||
# name="Value", header=with_tooltip("Value", "The estimated price impact of this feature vs the floor")
|
||||
# , html = TRUE
|
||||
# , align = "left"
|
||||
# , cell = function(x) {
|
||||
# htmltools::tags$span(paste0('+', format(round(x*1000)/10, digits=4, decimal.mark=".", big.mark=","), '%'))
|
||||
# }
|
||||
# )
|
||||
)
|
||||
)
|
||||
})
|
||||
@ -255,7 +260,7 @@ server <- function(input, output, session) {
|
||||
return(NULL)
|
||||
}
|
||||
data <- feature_values[ collection == eval(selected)]
|
||||
reactable(data[, list( feature, value, rarity, pct_vs_baseline )],
|
||||
reactable(data[, list( feature_name, feature_value, rarity, pct_vs_baseline )],
|
||||
defaultColDef = colDef(
|
||||
headerStyle = list(background = "#10151A")
|
||||
),
|
||||
@ -263,8 +268,8 @@ server <- function(input, output, session) {
|
||||
outlined = FALSE,
|
||||
searchable = TRUE,
|
||||
columns = list(
|
||||
feature = colDef(name = "Attribute", align = "left"),
|
||||
value = colDef(name = "Value", align = "left"),
|
||||
feature_name = colDef(name = "Attribute", align = "left"),
|
||||
feature_value = colDef(name = "Value", align = "left"),
|
||||
rarity = colDef(name = "Rarity", align = "left", cell = function(x) {
|
||||
htmltools::tags$span(paste0(format(x*100, digits=3, decimal.mark=".", big.mark=","),'%'))
|
||||
}),
|
||||
@ -504,7 +509,9 @@ server <- function(input, output, session) {
|
||||
df[, deal_score := round(pmin( 100, pmax(0, deal_score) )) ]
|
||||
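# deal score: percentile of the listing price under a Normal centered on the predicted price,
# flipped and scaled to 0-100, so listings priced well below the prediction score near 100.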
df[, deal_score := pnorm(price, pred_price, eval(SD_SCALE) * pred_sd * pred_price / pred_price_0), by = seq_len(nrow(df)) ]
|
||||
df[, deal_score := round(100 * (1 - deal_score)) ]
|
||||
df[, pred_price := round(pred_price) ]
|
||||
# df[, pred_price := round(pred_price) ]
|
||||
df[, pred_price := paste0(format(round(pred_price, 1), digits=3, decimal.mark=".", big.mark=",")) ]
|
||||
|
||||
df <- df[, list(token_id, price, pred_price, deal_score)]
|
||||
df <- df[order(-deal_score)]
|
||||
return(df)
|
||||
@ -517,7 +524,7 @@ server <- function(input, output, session) {
|
||||
if( nrow(df) == 0 ) {
|
||||
return(NULL)
|
||||
}
|
||||
df <- df[ deal_score >= 10 ]
|
||||
df <- df[ deal_score >= 0 ]
|
||||
df[, hover_text := paste0('<b>#',token_id,'</b><br>Listing Price: ',price,'<br>Fair Market Price: ',pred_price,'<br>Deal Score: ',deal_score) ]
|
||||
|
||||
fig <- plot_ly(
|
||||
|
||||
34
viz/ui.R
34
viz/ui.R
@ -102,17 +102,33 @@ fluidPage(
|
||||
, fluidRow(
|
||||
class="grey8row"
|
||||
, h2("Listings", icon(class="padding-left-10", id="listings-tooltip", "info-circle"))
|
||||
, bsTooltip(id = "listings-tooltip", title = "Plot only shows listings with deal score > 10; Click a dot to select the token", placement = "bottom", trigger = "hover")
|
||||
, bsTooltip(id = "listings-tooltip", title = "Plot only shows listings with deal score > 5; Click a dot to select the token", placement = "bottom", trigger = "hover")
|
||||
, div(
|
||||
class = "listing-plot"
|
||||
, plotlyOutput("listingplot", height = 500)
|
||||
, div(class='description', 'Plot only shows listings with deal score > 10')
|
||||
, div(class='description', 'Plot only shows listings with deal score > 5')
|
||||
, div(class='description', 'Click a dot to select the token')
|
||||
)
|
||||
, div(class = "table", reactableOutput("listingtable"))
|
||||
, div(class = "description", 'This app is still in beta - listing updates will be periodic (but at least 3x a week)')
|
||||
, div(class = "link", uiOutput('listingurl'))
|
||||
)
|
||||
, fluidRow(
|
||||
class="grey8row faq"
|
||||
, h2("FAQ")
|
||||
, h4("What is NFT Deal Score?")
|
||||
, div("We use historical sales data to determine the values and the rankings of each NFT.")
|
||||
, h4("Why is this rank different?")
|
||||
, div("Although rarity is a feature in our model, it is not just a rarity-based ranking. Certain features are put at a higher premium on the secondary marketplace, and this ranking reflects that.")
|
||||
, h4("Why are the rarity %s different?")
|
||||
, div("Our %s reflect only the NFTs in existence. Other tools may include more theoretical numbers.")
|
||||
, h4("How does the model work?")
|
||||
, div("Each attribute is an input into the model. We are working to add better model explanations to the tool.")
|
||||
, h4("How often is the data updated?")
|
||||
, div("Listings are updated 3x / week. Model is updated weekly.")
|
||||
, h4("Where can I send my questions?")
|
||||
, div(a(class="", href="https://twitter.com/nftdealscore", "@nftdealscore"), " on Twitter")
|
||||
)
|
||||
, fluidRow(
|
||||
class="grey8row"
|
||||
, h2("NFT Rankings", icon(class="padding-left-10", id="nft-rankings-tooltip", "info-circle"))
|
||||
@ -127,11 +143,11 @@ fluidPage(
|
||||
, div(class = "table", reactableOutput("salestable"))
|
||||
, div(class = "description", 'This app is still in beta - sales data may be incomplete or delayed')
|
||||
)
|
||||
, fluidRow(
|
||||
class="grey8row"
|
||||
, h2("Feature Summary", icon(class="padding-left-10", id="feature-summary-tooltip", "info-circle"))
|
||||
, bsTooltip(id = "feature-summary-tooltip", title = "Shows the rarity and estimated price impact of each feature", placement = "bottom", trigger = "hover")
|
||||
, div(class = "table", reactableOutput("featurestable"))
|
||||
, div(class = "description", 'Shows the rarity and estimated price impact of each feature')
|
||||
)
|
||||
# , fluidRow(
|
||||
# class="grey8row"
|
||||
# , h2("Feature Summary", icon(class="padding-left-10", id="feature-summary-tooltip", "info-circle"))
|
||||
# , bsTooltip(id = "feature-summary-tooltip", title = "Shows the rarity and estimated price impact of each feature", placement = "bottom", trigger = "hover")
|
||||
# , div(class = "table", reactableOutput("featurestable"))
|
||||
# , div(class = "description", 'Shows the rarity and estimated price impact of each feature')
|
||||
# )
|
||||
)
|
||||
|
||||
@ -264,6 +264,15 @@ tr {
|
||||
}
|
||||
|
||||
|
||||
/****************/
|
||||
/* FAQ */
|
||||
/****************/
|
||||
.faq > h4 {
|
||||
font-size: 22px;
|
||||
padding-top: 32px;
|
||||
}
|
||||
|
||||
|
||||
/*******************/
|
||||
/* General */
|
||||
/*******************/
|
||||
|
||||