mirror of
https://github.com/FlipsideCrypto/nft-deal-score.git
synced 2026-02-06 10:56:58 +00:00
723 lines
37 KiB
Python
import collections
import os
import re
import json
import warnings
import requests
import numpy as np
import pandas as pd
import kutils as ku
import urllib.request
import tensorflow as tf
import snowflake.connector

from copy import deepcopy
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

os.chdir('/Users/kellenblumberg/git/nft-deal-score')
from scrape_sol_nfts import clean_name

warnings.filterwarnings('ignore')


###################################
# Define Helper Functions #
###################################
def standardize_df(df, cols, usedf=None, verbose=False):
    # z-score each column against the mean/std of usedf (defaults to df itself);
    # 0/1 indicator columns are passed through unchanged
    if not isinstance(usedf, pd.DataFrame):
        usedf = df
    for c in cols:
        mu = usedf[c].mean()
        sd = usedf[c].std()
        if verbose:
            print(c)
        if len(df[c].unique()) == 2 and df[c].max() == 1 and df[c].min() == 0:
            # df['std_{}'.format(c)] = df[c].apply(lambda x: (x*2) - 1 )
            df['std_{}'.format(c)] = df[c]
        else:
            df['std_{}'.format(c)] = (df[c] - mu) / sd
    return(df)
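
# Example (hypothetical data): standardize_df(pd.DataFrame({'rank': [1, 2, 3]}), ['rank'])
# adds a 'std_rank' column of [-1.0, 0.0, 1.0] (mean 2, sample std 1).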


def merge(left, right, on=None, how='inner', ensure=True, verbose=True, message = ''):
    # guarded join: report (and optionally assert) whenever the merge changes the row count
    df = left.merge(right, on=on, how=how)
    if len(df) != len(left) and (ensure or verbose):
        if message:
            print(message)
        print('{} -> {}'.format(len(left), len(df)))
        cur = left.merge(right, on=on, how='left')
        cols = set(right.columns).difference(set(left.columns))
        print(cols)
        if ensure:
            col = list(cols)[0]
            missing = cur[cur[col].isnull()]
            print(missing.head())
            assert(False)
    return(df)
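
# Example: merge(s_df, tokens, on=['collection','token_id'], ensure=True) prints the
# unmatched rows and asserts if the join drops or fans out any sales rows.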


def just_float(x):
    x = re.sub(r'[^\d\.]', '', str(x))
    return(float(x))
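
# Example: just_float('1,234.5 SOL') -> 1234.5 (everything but digits and '.' is stripped).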


def calculate_percentages(df, cols=[]):
    # for each feature column, attach the share of tokens carrying that value
    # ('<col>_pct') and accumulate the product of the shares in 'pct'
    add_pct = 'pct' not in df.columns
    if not len(cols):
        cols = df.columns
    if add_pct:
        df['pct'] = 1
    for c in cols:
        g = df[c].value_counts().reset_index()
        g.columns = [ c, 'N' ]
        col = '{}_pct'.format(c)
        g[col] = g.N / g.N.sum()
        df = df.merge( g[[ c, col ]] )
        if add_pct:
            df['pct'] = df.pct * df[col]
    return(df)
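
# Example (hypothetical data): calculate_percentages(pd.DataFrame({'Hat': ['Cap','Cap','Crown']}), ['Hat'])
# yields Hat_pct = [2/3, 2/3, 1/3], and 'pct' holds the product across all feature columns.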


def get_sales(check_exclude = True, exclude=[]):

    s_df = pd.read_csv('./data/sales.csv').rename(columns={'sale_date':'block_timestamp'})
    s_df['token_id'] = s_df.token_id.astype(str)
    s_df['collection'] = s_df.collection.apply(lambda x: clean_name(x))
    s_df = s_df[-s_df.collection.isin(['Levana Meteors','Levana Dust'])]
    s_df = s_df[ -s_df.collection.isin(['boryokudragonz', 'Boryoku Dragonz']) ]
    s_df = s_df[[ 'chain','collection','block_timestamp','token_id','price','tx_id' ]]
    for e in exclude:
        # token_id was cast to str above, so compare against str(e[1])
        s_df = s_df[-( (s_df.collection == e[0]) & (s_df.token_id == str(e[1])) & (s_df.price == e[2]) )]
    s_df = s_df[ -((s_df.collection == 'smb') & (s_df.price < 1)) ]

    # exclude weird data points flagged in a previous run
    if not check_exclude:
        exclude = pd.read_csv('./data/exclude.csv')
        exclude['collection'] = exclude.collection.apply(lambda x: clean_name(x))
        exclude['token_id'] = exclude.token_id.astype(str)
        s_df = s_df.merge(exclude, how='left')
        s_df = s_df[s_df.exclude.isnull()]
        del s_df['exclude']

    ###########################
    # Calculate Floor #
    ###########################
    s_df['block_timestamp'] = s_df.block_timestamp.apply(lambda x: datetime.strptime(str(x)[:19], '%Y-%m-%d %H:%M:%S') if len(x) > 10 else datetime.strptime(x[:10], '%Y-%m-%d') )
    s_df['timestamp'] = s_df.block_timestamp.astype(int)
    s_df['days_ago'] = s_df.block_timestamp.apply(lambda x: (datetime.today() - x).days ).astype(int)

    # lowest price in last 20 sales
    s_df = s_df.sort_values(['collection','block_timestamp'])
    s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
    s_df = s_df.sort_values(['collection','block_timestamp'])
    s_df['md_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.01).reset_index(0,drop=True)

    # exclude sales that are far below the existing floor
    s_df = s_df[ (s_df.price) >= (s_df.md_20 * 0.70) ]

    # low (5.25%) percentile of the last 20 sales, used as the rolling floor
    s_df = s_df.sort_values(['collection','block_timestamp'])
    s_df['mn_20'] = s_df.groupby('collection').price.shift(1)
    s_df = s_df.sort_values(['collection','block_timestamp'])
    s_df['mn_20'] = s_df.groupby('collection')['mn_20'].rolling(20).quantile(.0525).reset_index(0,drop=True)
    s_df['sim'] = 0
    s_df['tmp'] = s_df.block_timestamp.apply(lambda x: str(x)[:10] )
    s_df.groupby(['collection','tmp']).mn_20.mean().reset_index().to_csv('~/Downloads/mn_20.csv', index=False)
    return(s_df)
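
# Example: get_sales(check_exclude=False, exclude=[('aurory', 2239, 3500)]) drops that
# (collection, token_id, price) sale plus anything flagged in ./data/exclude.csv.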


def get_coefs(cols, coef):
    coefs = []
    for a, b in zip(cols, coef):
        coefs += [[a,b]]
    coefs = pd.DataFrame(coefs, columns=['col','coef']).sort_values('coef', ascending=0)
    # coefs.to_csv('~/Downloads/{}_lin_coefs.csv'.format(collection), index=False)
    # coefs['tmp'] = coefs.col.apply(lambda x: 'nft_rank' in x )
    # coefs['mult'] = coefs.col.apply(lambda x: -1 if x == 'std_nft_rank' else 1 )
    # sign-adjust each coefficient ('val' = mult * coef); features whose adjusted
    # value stays negative get pruned by the callers below
    coefs['mult'] = coefs.apply(lambda x: -1 if x['col'] == 'std_nft_rank' else 1 if x['coef'] >= 0 or 'adj_nft_rank' in x['col'] or 'is_top_' in x['col'] or 'y_pred_' in x['col'] else -1 , 1 )
    coefs['val'] = coefs.mult * coefs.coef
    coefs = coefs.sort_values('val', ascending=0)
    return(coefs)
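
# Example: get_coefs(cur_std_pred_cols, clf.coef_) returns one row per feature with
# columns [col, coef, mult, val], sorted so the weakest feature is last.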


def train_model(check_exclude, supplement_with_listings):
    # hand-curated (collection, token_id, price) sales to drop from training
    exclude = [
        ( 'aurory', 2239, 3500 )
        , ( 'aurory', 1876, 789 )
        , ( 'aurory', 2712, 500 )
        , ( 'aurory', 5368, 500 )
        , ( 'aurory', 9239, 1700 )
    ]
    s_df = get_sales(check_exclude, exclude)

    #########################
    # Load Metadata #
    #########################
    m_df = pd.read_csv('./data/metadata.csv')
    # m_df[m_df.collection == 'Aurory'][['collection','feature_name']].drop_duplicates().to_csv('~/Downloads/tmp.csv', index=False)
    sorted([x for x in m_df.feature_name.unique() if 'nft_' in x])
    m_df['token_id'] = m_df.token_id.astype(str)
    m_df['collection'] = m_df.collection.apply(lambda x: clean_name(x))
    # remove rows that are not actually metadata
    m_df = m_df[ -m_df.feature_name.isin([ 'price','last_sale','feature_name','feature_value' ]) ]
    m_df['feature_value'] = m_df.feature_value.apply(lambda x: re.split(r'\(', re.sub('"', '', x))[0] if type(x)==str else x )
    m_df[(m_df.feature_name=='rank') & (m_df.collection == 'Levana Dragon Eggs')]
    sorted(m_df[ (m_df.collection == 'Solana Monkey Business') ].feature_name.unique())


    #####################################
    # Exclude Special LunaBulls #
    #####################################
    tokens = pd.read_csv('./data/tokens.csv')
    tokens['collection'] = tokens.collection.apply(lambda x: clean_name(x))
    tokens['token_id'] = tokens.token_id.astype(str)
    m_df = merge(m_df, tokens[['collection','token_id','clean_token_id']], how='left', ensure=True, on=['collection','token_id'], message='m_df x tokens')
    m_df['token_id'] = m_df.clean_token_id.fillna(m_df.token_id).astype(int).astype(str)
    s_df = merge(s_df, tokens[['collection','token_id','clean_token_id']], how='left', ensure=True, on=['collection','token_id'], message='s_df x tokens')
    s_df['token_id'] = s_df.clean_token_id.fillna(s_df.token_id).astype(int).astype(str)
    tokens.token_id.unique()
    # special LunaBulls live above clean_token_id 10000; drop them from metadata and sales
    lunabullsrem = tokens[tokens.clean_token_id>=10000].token_id.unique()
    m_df = m_df[ -((m_df.collection == 'LunaBulls') & (m_df.token_id.isin(lunabullsrem))) ]
    s_df = s_df[ -((s_df.collection == 'LunaBulls') & (s_df.token_id.isin(lunabullsrem))) ]
    s_df = s_df.drop_duplicates(subset=['collection','token_id','price'])

    listings = pd.read_csv('./data/listings.csv')
    if supplement_with_listings:
        # treat current listings priced below their predicted price as pseudo-sales
        # to densify training data
        pred_price = pd.read_csv('./data/pred_price.csv')
        pred_price['collection'] = pred_price.collection.apply(lambda x: clean_name(x))
        listings['collection'] = listings.collection.apply(lambda x: clean_name(x))
        listings['block_timestamp'] = s_df.block_timestamp.max()
        listings = listings[listings.collection.isin(pred_price.collection.unique())]
        floor = s_df.sort_values('timestamp').groupby('collection').tail(1)[['collection','mn_20']]
        tmp = merge(listings, pred_price, ensure=False)
        tmp = tmp[tmp.price < tmp.pred_price]
        tmp['timestamp'] = tmp.block_timestamp.astype(int)
        tmp['days_ago'] = tmp.block_timestamp.apply(lambda x: (datetime.today() - x).days ).astype(int)
        tmp = merge(tmp, floor)

        # append the qualifying listings 1-3 times, scaled by how many real sales we have
        n = round(len(s_df) / 5000)
        n = max(1, min(3, n))
        print('Supplement with {}x listings'.format(n))
        # n = 1
        for _ in range(n):
            s_df = s_df.append(tmp[[ 'block_timestamp','timestamp','collection','token_id','price','mn_20' ]])
        # tmp_1 = tmp[tmp.price <= 0.8 * tmp.pred_price]
        # s_df = s_df.append(tmp_1[[ 'block_timestamp','timestamp','collection','token_id','price','mn_20' ]])
        # tmp_2 = tmp[tmp.price <= 0.6 * tmp.pred_price]
        # tmp_2 = s_df.append(tmp_2[[ 'block_timestamp','timestamp','collection','token_id','price','mn_20' ]])


    ###############################
    # Load Prior Model Outputs #
    ###############################
    # coefsdf = pd.DataFrame()
    # salesdf = pd.DataFrame()
    # attributes = pd.DataFrame()
    # pred_price = pd.DataFrame()
    # feature_values = pd.DataFrame()
    coefsdf = pd.read_csv('./data/coefsdf.csv')
    salesdf = pd.read_csv('./data/model_sales.csv')
    attributes = pd.read_csv('./data/attributes.csv')
    pred_price = pd.read_csv('./data/pred_price.csv')
    feature_values = pd.read_csv('./data/feature_values.csv')

    # non-binary in model: collection_rank, temperature, weight
    # non-binary in model; exclude from rarity: pct, rank, score
    # exclude from model: lucky_number, shower
    # exclude from model and rarity %: meteor_id, attribute_count, cracking_date
    # ALL_NUMERIC_COLS = ['rank','score','pct']
    ALL_NUMERIC_COLS = ['nft_rank','adj_nft_rank_0','adj_nft_rank_1','adj_nft_rank_2']
    MODEL_EXCLUDE_COLS = {
        # 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
        'Levana Dragon Eggs': ['meteor_id','shower','lucky_number','cracking_date','attribute_count','rarity_score_rank','rarity_score','weight']
        , 'Solana Monkey Business': ['Clothes_Diamond']
    }
    MODEL_INCLUDE_COLS = {
        # 'Solana Monkey Business': ['std_Hat_Strawhat','std_Hat_Space Warrior Hair','std_Clothes_Diamond','std_Eyes_Solana Vipers','std_Eyes_Vipers','std_Hat_Sombrero','std_Eyes_3D Glasses','std_Hat_Cowboy Hat','std_Eyes_Laser Eyes','std_matching_cop','std_matching_white','std_matching_black']
        'Solana Monkey Business': ['std_Hat_Space Warrior Hair','std_matching_cop','std_Hat_Cowboy Hat','std_Hat_Sombrero','std_Hat_Solana Backwards Cap','std_Eyes_Solana Vipers','std_Eyes_Laser Eyes','std_Type_Solana']
    }
    RARITY_EXCLUDE_COLS = {
        # 'Levana Dragon Eggs': ['collection_rank','meteor_id','shower','lucky_number','cracking_date','attribute_count','weight','temperature']
        'Levana Dragon Eggs': ['meteor_id','attribute_count','collection_rank','transformed_collection_rank','rarity_score','rarity_score_rank','collection_rank_group']
    }
    NUMERIC_COLS = {
        'Levana Dragon Eggs': ['collection_rank','temperature','transformed_collection_rank']
    }
    ATT_EXCLUDE_COLS = {
        'Levana Dragon Eggs': ['attribute_count','transformed_collection_rank','collection_rank_group']
    }

    collection = 'Aurory'
    collection = 'Solana Monkey Business'
    collection = 'LunaBulls'
    # for collection in [ 'Solana Monkey Business' ]:
    # for collection in [ 'Aurory' ]:
    # for collection in [ 'Aurory','Solana Monkey Business' ]:
    collections = list(s_df[['collection']].drop_duplicates().merge(m_df[['collection']].drop_duplicates()).collection.unique())
    for collection in [ 'LunaBulls' ]:
        # drop any previously-saved rows for this collection; note that rebinding a
        # loop variable (for df in [...]: df = ...) would not modify the outer frames
        if 'collection' in coefsdf.columns:
            coefsdf = coefsdf[coefsdf.collection != collection]
        if 'collection' in salesdf.columns:
            salesdf = salesdf[salesdf.collection != collection]
        if 'collection' in attributes.columns:
            attributes = attributes[attributes.collection != collection]
        if 'collection' in pred_price.columns:
            pred_price = pred_price[pred_price.collection != collection]
        if 'collection' in feature_values.columns:
            feature_values = feature_values[feature_values.collection != collection]
        print('Working on collection {}'.format(collection))
        sales = s_df[ s_df.collection == collection ]
        metadata = m_df[ m_df.collection == collection ]
        metadata.groupby(['feature_name','feature_value']).token_id.count().reset_index().to_csv('~/Downloads/tmp.csv', index=False)
        metadata[metadata.token_id == '1']
        metadata['feature_name'] = metadata.feature_name.apply(lambda x: x.strip() )
        metadata[metadata.token_id == '1']
        metadata[metadata.feature_name == 'rank']
        metadata.feature_name.unique()
        metadata[(metadata.token_id=='1') & (metadata.collection == 'Solana Monkey Business')]

        # categorize columns
        all_names = sorted(metadata.feature_name.unique())
        model_exclude = MODEL_EXCLUDE_COLS[collection] if collection in MODEL_EXCLUDE_COLS.keys() else []
        num_features = sorted((NUMERIC_COLS[collection] if collection in NUMERIC_COLS.keys() else []) + ALL_NUMERIC_COLS)
        num_features = [ x for x in num_features if x in metadata.feature_name.unique() ]
        num_metadata = metadata[metadata.feature_name.isin(num_features)]
        num_metadata[num_metadata.feature_name == 'nft_rank']
        cat_features = sorted([ x for x in all_names if not x in (model_exclude + num_features) ])
        cat_metadata = metadata[metadata.feature_name.isin(cat_features)]

        # pivot numeric features from long to wide (one row per token)
        num_metadata = num_metadata.pivot( index=['collection','token_id'], columns='feature_name', values='feature_value' ).reset_index()
        num_metadata.columns = [ 'collection','token_id' ] + num_features

        # pivot categorical features from long to wide
        cat_metadata = cat_metadata.pivot( index=['collection','token_id'], columns='feature_name', values='feature_value' ).reset_index()
        cat_metadata.columns = [ 'collection','token_id' ] + cat_features
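
        # e.g. long rows (collection, token_id, 'Hat', 'Crown') become one wide row per
        # token with a 'Hat' column holding 'Crown'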
        cat_metadata = calculate_percentages( cat_metadata, cat_features )
        dummies = pd.get_dummies(cat_metadata[cat_features])
        # dummies.head(1).to_csv('~/Downloads/tmp2.csv', index=False)
        if collection == 'Solana Monkey Business':
            # hand-built interaction features for trait combos the market prices up
            dummies['matching_cop'] = ((dummies['Clothes_Cop Vest'] == 1) & (dummies['Hat_Cop Hat'] == 1)).astype(int)
            dummies['matching_white'] = ((dummies['Clothes_Beige Smoking'] == 1) & ((dummies['Hat_White Fedora 1'] + dummies['Hat_White Fedora 2']) == 1)).astype(int)
            dummies['matching_black'] = ((dummies['Clothes_Black Smoking'] == 1) & ((dummies['Hat_Black Fedora 1'] + dummies['Hat_Black Fedora 2'] + dummies['Hat_Black Top Hat']) == 1)).astype(int)
            dummies['matching_top'] = ((dummies['matching_black'] == 1) | (dummies['matching_white']== 1)).astype(int)
            # dummies['matching_green'] = ((dummies['Clothes_Green Smoking'] == 1) & ((dummies['Hat_Green Top Hat']) == 1)).astype(int)
            # dummies['naked_1_att'] = ((dummies['Attribute Count_1'] == 1) & (dummies['Clothes_None'] == 1)).astype(int)
            # dummies['naked_1_att_hat'] = ((dummies['Attribute Count_1'] == 1) & (dummies['Hat_None'] == 0)).astype(int)
            dummies['fedora'] = (dummies['Hat_Black Fedora 1'] + dummies['Hat_Black Fedora 2'] + dummies['Hat_White Fedora 1'] + dummies['Hat_White Fedora 2'] >= 1 ).astype(int)
            dummies['backwards_cap'] = (dummies['Hat_Black Backwards Cap'] + dummies['Hat_Blue Backwards Cap'] + dummies['Hat_Green Backwards Cap'] + dummies['Hat_Orange Backwards Cap'] + dummies['Hat_Purple Backwards Cap'] + dummies['Hat_Solana Backwards Cap'] >= 1 ).astype(int)
            del dummies['matching_white']
            del dummies['matching_black']
        cat_metadata = pd.concat([ cat_metadata.reset_index(drop=True), dummies.reset_index(drop=True) ], axis=1)
        del cat_metadata['pct']
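
        # e.g. pd.get_dummies on a 'Hat' column with values {'Crown', 'None'} yields
        # 0/1 indicator columns 'Hat_Crown' and 'Hat_None'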

        for c in model_exclude:
            if c in dummies.columns:
                del dummies[c]
        pred_cols = num_features + list(dummies.columns)
        pred_cols = [ c for c in pred_cols if not c in model_exclude ]

        # create training df
        sales['token_id'] = sales.token_id.astype(str)
        num_metadata['token_id'] = num_metadata.token_id.astype(str)
        df = merge(sales, num_metadata, ['collection','token_id'], ensure=False)
        df = merge(df, cat_metadata, ['collection','token_id'], ensure=False)
        for c in num_features:
            df[c] = df[c].apply(lambda x: just_float(x))


        #################################
        # Create Test DataFrame #
        #################################
        # test = merge(num_metadata, cat_metadata, ['collection','token_id'], ensure=True, how='left')
        ensure = collection not in ['Aurory']
        # test = merge(num_metadata, cat_metadata, ['collection','token_id'], ensure=ensure)
        test = merge(num_metadata, cat_metadata, ['collection','token_id'], ensure=False)
        for c in num_features:
            test[c] = test[c].apply(lambda x: just_float(x) )
        tail = df.sort_values('timestamp').tail(1)
        # manual rank correction for token #903
        test.loc[ test.token_id == '903', 'nft_rank' ] = 18
        # carry the latest observed floor / timestamp over to the test frame
        for c in [ 'std_timestamp','mn_20','log_mn_20' ]:
            if c in tail.columns:
                test[c] = tail[c].values[0]

        for tmp in [df, test]:
            for i in [100, 250, 1000]:
                tmp['is_top_{}'.format(i)] = (tmp.nft_rank <= i).astype(int)
        pred_cols += [ 'is_top_100','is_top_250','is_top_1000' ]
        df.sort_values('price', ascending=0)[['price']].head(20)
        # df.groupby(['rarity','weight']).price.mean()

        # create target cols: adj_price clips deep under-floor sales, rel_price_0 is the
        # premium over the rolling floor, rel_price_1 the multiple of it
        target_col = 'adj_price'
        df[target_col] = df.apply(lambda x: max(0.7 * (x['mn_20'] - 0.2), x['price']), 1 )
        df['mn_20'] = df.mn_20 * 1.01
        df = df[df[target_col].notnull()]
        df['log_price'] = df[target_col].apply(lambda x: np.log(x) )
        df['rel_price_0'] = df[target_col] - df.mn_20
        df['rel_price_1'] = df[target_col] / df.mn_20
        df = df[df.mn_20 > 0]
        df['log_mn_20'] = np.log(df.mn_20)
        print('Training on {} sales'.format(len(df)))
        df = standardize_df(df, pred_cols)
        test = standardize_df(test, pred_cols, df)

        std_pred_cols_0 = [ 'std_{}'.format(c) for c in pred_cols ]
        std_pred_cols = [ 'std_{}'.format(c) for c in pred_cols ]
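
        # e.g. with mn_20 = 10 and price = 25: adj_price = max(0.7 * 9.8, 25) = 25,
        # then mn_20 becomes 10.1, rel_price_0 = 14.9 and rel_price_1 ~ 2.48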


        #########################
        # Run the Model #
        #########################
        # drop predictors with no observations in the training data
        tmp = df[std_pred_cols].count().reset_index()
        tmp.columns = ['a','b']
        tmp.sort_values('b').head(20)
        rem = list(tmp[tmp.b==0].a.values)
        std_pred_cols = [ c for c in std_pred_cols if not c in rem ]
        if collection == 'Levana Dragon Eggs':
            std_pred_cols = [ 'std_essence_Dark','std_collection_rank_group_0','std_rarity_Legendary','std_rarity_Rare','std_rarity_Ancient','std_collection_rank','std_transformed_collection_rank' ]
        # exponential recency weighting: the oldest sale gets weight 1, the newest weight 3
        mn = df.timestamp.min()
        mx = df.timestamp.max()
        df['wt'] = df.timestamp.apply(lambda x: 3.0 ** ((x - mn) / (mx - mn)) )
        df.loc[ (df.collection == 'Aurory') & (df.block_timestamp <= '2021-09-05'), 'wt' ] = 0.05
        if collection == 'Levana Dragon Eggs':
            df['wt'] = 1
        # df['wt'] = df.price.apply(lambda x: 1.0 / (x ** 0.9) )
        # df.sort_values('price', ascending=0)[['price','wt']].head(20)
        # std_pred_cols = [ 'std_Hat_Crown','std_adj_nft_rank_0','std_Hat_None','std_Eyes_None','std_Clothes_None','std_Attribute Count_4','std_Mouth_None','std_adj_nft_rank_1','std_Type_Dark','std_Ears_None','std_Background_Light purple','std_Hat_Black Fedora 2','std_Hat_White Fedora 2','std_Attribute Count_0','std_Type_Skeleton','std_Attribute Count_2','std_Attribute Count_1','std_Hat_Protagonist Black Hat','std_Clothes_Sailor Vest','std_Mouth_Pipe','std_Hat_Protagonist White Hat','std_Clothes_Pirate Vest','std_Hat_Roman Helmet','std_Type_Solana','std_Clothes_Beige Smoking','std_Hat_Military Helmet','std_Hat_White Fedora 1','std_naked_1_att','std_Type_Zombie','std_Clothes_Roman Armor','std_Eyes_3D Glasses','std_Clothes_Orange Kimono','std_Hat_Green Punk Hair','std_Hat_Sombrero','std_Clothes_Military Vest','std_Hat_Space Warrior Hair','std_Hat_Blue Punk Hair','std_Clothes_Orange Jacket','std_Ears_Earing Silver','std_Eyes_Laser Eyes','std_Eyes_Vipers','std_Type_Alien','std_Type_Red','std_Hat_Admiral Hat' ]
        cur_std_pred_cols = [ 'std_adj_nft_rank_0','std_Hat_Crown','std_adj_nft_rank_1','std_Type_Skeleton','std_Type_Alien','std_Clothes_None','std_Eyes_Vipers','std_Hat_Space Warrior Hair','std_Type_Zombie','std_Clothes_Pirate Vest','std_Clothes_Orange Kimono','std_Eyes_Laser Eyes','std_Type_Solana','std_Hat_Ninja Bandana','std_Hat_Solana Backwards Cap','std_Eyes_Solana Vipers','std_Attribute Count_0','std_Attribute Count_1','std_Attribute Count_2','std_Attribute Count_3','std_Attribute Count_5','std_Hat_Strawhat','std_Hat_Admiral Hat','std_matching_top','std_Hat_Sombrero','std_matching_cop','std_Hat_Cowboy Hat','std_Hat_None' ]
        cur_std_pred_cols = deepcopy(std_pred_cols)
        g = df[std_pred_cols].sum().reset_index()
        g.columns = [ 'col','cnt' ]
        g = g.sort_values('cnt')
        g.head(20)
        if collection == 'Solana Monkey Business':
            df.loc[ df.token_id == '903', 'nft_rank' ] = 18
        df[df.token_id=='903']
        df[df.token_id==903]
        df = df.reset_index(drop=True)
        X = df[cur_std_pred_cols].values
        y_0 = df.rel_price_0.values
        y_1 = df.rel_price_1.values

        # current floor = cheapest live listing for the collection
        # CUR_FLOOR = df.sort_values('block_timestamp', ascending=0).mn_20.values[0]
        CUR_FLOOR = listings[(listings.collection == collection) & (listings.price.notnull())].price.min()
        print('CUR_FLOOR = {}'.format(CUR_FLOOR))

        for target_col in [ 'rel_price_0', 'rel_price_1' ]:
            it = target_col[-1]
            y_val = df[target_col].values
            print('target_col = {}'.format(target_col))
            mn = -1
            cols = [ 'std_nft_rank','std_adj_nft_rank_0','std_adj_nft_rank_1','std_adj_nft_rank_2' ]
            clf = Ridge(alpha = 1)
            # while mn < 0 and len(cols):
            #     clf.fit(df[cols].values, y_val, df.wt.values)
            #     coefs = get_coefs(cols, clf.coef_)
            #     mn = min(coefs.val) if len(coefs) else 0
            #     if mn < 0:
            #         cols.remove(coefs.col.values[-1])

            # the rank-only rarity model is currently disabled, so rarity_value_* is 0
            # and the rarity-adjusted target equals the raw target
            col = 'rarity_value_'+it
            model = 'ridge'
            df[col] = 0
            test[col] = 0
            # df, bst_p, bst_r = ku.get_bst_params( model, df, df[cols].values, y_val, target_col, col, verbose = True, wt_col='wt' )
            # test = ku.apply_model( model, bst_p, df, test, cols, target_col, col)

            # df['rarity_value_'+it] = clf.predict(df[cols].values)
            rar_adj_target_col = 'rar_adj_'+target_col
            df[rar_adj_target_col] = df[target_col] - df['rarity_value_'+it]
            # test[rar_adj_target_col] = test[target_col] - test['rarity_value_'+it]
            y_val_rar_adj = df[rar_adj_target_col].values
            models = ['las','ridge'] if target_col == 'rel_price_1' else ['las','ridge','rfr']
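
            # NOTE: ku.get_bst_params / ku.apply_model / ku.get_model come from the
            # author's local kutils module (not in this file); from their usage here they
            # appear to tune hyperparameters, write out-of-fold predictions into `col`,
            # and score the fitted model on the test frame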
            for model in models:
                cur_std_pred_cols = std_pred_cols
                print(model)
                # the random forest is trained on the rarity-adjusted target
                y = y_val_rar_adj if model in ['rfr'] else y_val
                col = 'y_pred_{}_{}'.format(model, it)
                df, bst_p, bst_r = ku.get_bst_params( model, df, X, y, target_col, col, verbose = True, wt_col='wt' )

                # if model == 'ridge':
                #     while len(cur_std_pred_cols) > 50:
                #         coefs = get_coefs(cur_std_pred_cols, clf.coef_)
                #         cur_std_pred_cols.remove(coefs.col.values[-1])
                #         new_X = df[cur_std_pred_cols].values
                #         clf = ku.get_model(model, bst_p)
                #         clf.fit(new_X, y)
                #     # coefs.to_csv('./data/coefs/{}_{}_{}.csv'.format(collection, model, it))
                #     new_X = df[cur_std_pred_cols].values
                #     df, bst_p, bst_r = ku.get_bst_params( model, df, new_X, y, target_col, col, verbose = True, wt_col='wt' )

                if model in ['las','ridge']:
                    # iteratively drop features whose sign-adjusted coefficient is
                    # negative and refit until every remaining coefficient is non-negative
                    clf = ku.get_model(model, bst_p)
                    clf.fit(X, y)
                    coefs = get_coefs(cur_std_pred_cols, clf.coef_)
                    mn = coefs.val.min()
                    while mn < 0:
                        cur_std_pred_cols = [ c for c in coefs[coefs.val >= 0 ].col.unique() ]
                        X_new = df[cur_std_pred_cols].values
                        clf.fit(X_new, y)
                        # df, bst_p, bst_r = ku.get_bst_params( model, df, df[cur_std_pred_cols].values, y, target_col, col, verbose = True, wt_col='wt' )
                        coefs = get_coefs(cur_std_pred_cols, clf.coef_)
                        mn = coefs.val.min()
                    coefs.to_csv('./data/coefs/{}_{}_{}.csv'.format(collection, model, it), index=False)
                test = ku.apply_model( model, bst_p, df, test, cur_std_pred_cols, target_col, col)
                if model in ['rfr']:
                    # add the rarity component back onto the rfr predictions
                    df[col] = df[col] + df['rarity_value_'+it]
                    test[col] = test[col] + test['rarity_value_'+it]

            # stack the per-model predictions with a linear blend, dropping any model
            # that gets a negative blend weight
            mn = -1
            cols = [ c for c in df.columns if c[:7] == 'y_pred_' and c[-1] == it ]
            clf = LinearRegression()
            df[cols].mean()
            df[cols].median()
            test[cols].mean()
            test[cols].median()
            while mn < 0 and len(cols):
                clf.fit(df[cols].values, df[target_col].values)
                coefs = get_coefs(cols, clf.coef_)
                mn = min(coefs.val) if len(coefs) else 0
                if mn < 0:
                    cols.remove(coefs.col.values[-1])
                else:
                    print(coefs)
            if it == '0':
                # additive target: predicted premium plus the floor
                df['pred_lin'] = clf.predict(df[cols].values) + df.mn_20
                test['pred_lin'] = clf.predict(test[cols].values) + CUR_FLOOR
                # df['pred_lin'] = df.pred_lin.apply(lambda x: max(0, x)) + df.mn_20
            else:
                # multiplicative target: predicted multiple (floored at 1) times the floor
                df['pred_log'] = clf.predict(df[cols].values)
                df['pred_log'] = df.pred_log.apply(lambda x: max(1, x)) * df.mn_20
                test['pred_log'] = clf.predict(test[cols].values)
                test['pred_log'] = test.pred_log.apply(lambda x: max(1, x)) * CUR_FLOOR
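
            # e.g. if CUR_FLOOR = 100 and the blended multiple for a token is 1.5,
            # pred_log = 150; a blended premium of 20 gives pred_lin = 120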

        # blend the additive and multiplicative predictions; if either gets a negative
        # weight, fall back to the other one alone
        clf = LinearRegression(fit_intercept=False)
        target_col = 'adj_price'
        clf.fit( df[['pred_lin','pred_log']].values, df[target_col].values, df.wt.values )
        clf.score( df[['pred_lin','pred_log']].values, df[target_col].values, df.wt.values )
        df[['pred_lin','pred_log',target_col]].mean()
        df[['pred_lin','pred_log',target_col]].median()
        test[['pred_lin','pred_log']].mean()
        test[['pred_lin','pred_log']].median()

        print('Price = {} * lin + {} * log'.format( round(clf.coef_[0], 2), round(clf.coef_[1], 2) ))
        tmp = pd.DataFrame([[collection, clf.coef_[0], clf.coef_[1], CUR_FLOOR]], columns=['collection','lin_coef','log_coef','floor_price'])
        if clf.coef_[0] < 0:
            print('Only using log')
            df['pred'] = df.pred_log
            test['pred'] = test.pred_log
            tmp['lin_coef'] = 0
            tmp['log_coef'] = 1
        elif clf.coef_[1] < 0:
            print('Only using lin')
            df['pred'] = df.pred_lin
            test['pred'] = test.pred_lin
            tmp['lin_coef'] = 1
            tmp['log_coef'] = 0
        else:
            print('Using BOTH!')
            df['pred'] = clf.predict( df[['pred_lin','pred_log']].values )
            test['pred'] = clf.predict( test[['pred_lin','pred_log']].values )
        coefsdf = coefsdf.append(tmp)
        df['err'] = (df.pred / df[target_col]).apply(lambda x: abs(x-1) )

        # print out some summary stats
        df['err'] = df[target_col] - df.pred
        df['q'] = (df.pred.rank() ** 1.5 * .2) / len(df)
        df['q'] = df.q.apply(lambda x: int(round(x)) )
        df['pct_err'] = (df[target_col] / df.pred) - 1
        pe_mu = df.pct_err.mean()
        # sd of recent, non-extreme percentage errors; used to report a prediction band
        pe_sd = df[ (df.pct_err > -.9) & (df.pct_err < 0.9) & (df.days_ago<=50) ].pct_err.std()
        if pe_sd != pe_sd:  # NaN check: fall back to all dates
            pe_sd = df[ (df.pct_err > -.9) & (df.pct_err < 0.9) ].pct_err.std()
        df['pred_price'] = df.pred  # .apply(lambda x: x*(1+pe_mu) )
        df['pred_sd'] = df.pred * pe_sd
        # print(df.groupby('q')[['err','pred',target_col]].mean())
        print(df[df.wt >= df.wt.median()].groupby('q')[['err','pred',target_col]].mean())
        print(df.groupby('q')[['err','pred',target_col]].mean())
        # df.err.mean()
        # df[df.weight >= 3.5].err.mean()
        df[df.pred < 200].err.mean()
        df['collection'] = collection
        print('Avg err last 100: {}'.format(round(df.sort_values('block_timestamp').tail(100).err.mean(), 2)))
        salesdf = salesdf.append( df.merge(s_df[s_df.sim == 0][['collection','token_id','block_timestamp','price']] )[[ 'collection','token_id','block_timestamp','price','pred','mn_20','nft_rank' ]].sort_values('block_timestamp', ascending=0) )


        ############################################################
        # Create Predictions for Each NFT in The Collection #
        ############################################################
        # legacy single-model predictions, superseded by the blend above:
        # test['pred_lin'] = clf_lin.predict(test[lin_std_pred_cols].values)
        # test['pred_lin'] = test.pred_lin.apply(lambda x: max(0, x) + l)
        # test['pred_log'] = clf_log.predict(test[log_std_pred_cols].values)
        # test['pred_log'] = test.pred_log.apply(lambda x: max(1, x)) * l

        # test['pred_price'] = test.pred.apply(lambda x: x if x < 400 else (x-400)**0.96 + 400 )
        # dampen predictions above 2.5x the current floor with a sublinear exponent
        def f(p):
            c = CUR_FLOOR * 2.5
            return( p if p <= c else c+((p-c) ** 0.95) )
        test['pred_price'] = test.pred.apply(lambda x: f(x) )
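
        # e.g. with CUR_FLOOR = 100: c = 250, so f(180) = 180 but f(500) = 250 + 250**0.95 ~ 440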
        len(test[test.pred <= CUR_FLOOR * 1.01])
        len(test[test.pred <= CUR_FLOOR * 1.02])
        if not check_exclude:
            # small (1.5%) haircut on published prices once outliers have been excluded
            test['pred_price'] = test.pred_price.apply(lambda x: (x*0.985) )
        len(test[test.pred_price <= CUR_FLOOR])
        test['pred_sd'] = test.pred_price * pe_sd
        test = test.sort_values(['collection','token_id'])
        test['rk'] = test.pred_price.rank(ascending=0, method='first')
        test['collection'] = collection
        pred_price = pred_price.append( test[[ 'collection','token_id','nft_rank','rk','pred_price','pred_sd' ]].sort_values('pred_price') ).drop_duplicates(subset=['collection','token_id'], keep='last')


        # estimate how much each trait lifts price: median predicted price of tokens
        # with the trait vs. the collection's 47.5th-percentile predicted price
        imp = []
        for c in dummies.columns:
            md = test[test[c] == 1].pred_price.median()
            md_0 = test.pred_price.quantile(0.475)
            imp += [[ collection, c, md_0, md ]]
        # imp = pd.DataFrame(imp, columns=['collection','feature_name',''])
        imp = pd.DataFrame(imp, columns=['collection','col','col_md','md']).sort_values('md', ascending=0)
        imp['pct_vs_baseline'] = ((imp.md / imp.col_md) - 1).apply(lambda x: max(0, x))
        # split only on the first underscore so values like 'Hat_Space Warrior Hair' survive
        imp['feature_name'] = imp.col.apply(lambda x: x.split('_', 1)[0] )
        imp['feature_value'] = imp.col.apply(lambda x: x.split('_', 1)[1] if '_' in x else None )
        feature_values = feature_values.append(imp[['collection','feature_name','feature_value','pct_vs_baseline']])
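
        # e.g. if tokens with 'Hat_Crown' have a median predicted price of 150 against a
        # baseline of 100, pct_vs_baseline = 0.5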

        # per-trait rarity: share of tokens in the collection carrying each value
        cols = metadata.feature_name.unique()
        cols = [ x for x in cols if not x in (ATT_EXCLUDE_COLS[collection] if collection in ATT_EXCLUDE_COLS.keys() else []) + ALL_NUMERIC_COLS ]
        exclude = RARITY_EXCLUDE_COLS[collection] if collection in RARITY_EXCLUDE_COLS.keys() else []
        for c in cols:
            cur = metadata[metadata.feature_name == c][['collection','token_id','feature_name','feature_value']]
            l = len(cur.token_id.unique())
            if c in exclude:
                cur['rarity'] = None
            else:
                g = cur.groupby('feature_value').token_id.count().reset_index()
                g['rarity'] = g.token_id / l
                cur = merge(cur, g[['feature_value','rarity']])
            attributes = attributes.append(cur)

        attributes['feature_name'] = attributes.feature_name.apply(lambda x: re.sub('_', ' ', x).title().strip() )
        attributes['feature_value'] = attributes.feature_value.apply(lambda x: str(x).strip() )
        sorted(attributes['feature_name'].unique())

        if len(feature_values):
            feature_values['feature_name'] = feature_values.feature_name.apply(lambda x: re.sub('_', ' ', x).title() )
            # feature_values = pd.read_csv('./data/feature_values.csv')
            # keep only trait names that survived into the attributes frame
            feature_values = feature_values.merge(attributes[['collection','feature_name']].drop_duplicates())
            # n = feature_values[['collection', 'feature_name']].drop_duplicates().groupby(['collection']).feature_name.count().reset_index().rename(columns={'feature_name': 'n'})
            # feature_values = feature_values.merge(n)
            # feature_values['pct_vs_baseline'] = feature_values.pct_vs_baseline / feature_values.n
            # del feature_values['n']
        feature_values[ (feature_values.collection == 'Solana Monkey Business') & (feature_values.feature_name == 'Clothes') ]
        feature_values[ (feature_values.collection == 'Solana Monkey Business') & (feature_values.feature_name == 'Clothes') & (feature_values.feature_value == 'Poncho') ]
        attributes[ (attributes.collection == 'Solana Monkey Business') & (attributes.feature_name == 'Clothes') & (attributes.feature_value == 'Poncho') & (attributes.token_id == '1') ]
        attributes[ (attributes.collection == 'Solana Monkey Business') & (attributes.feature_name == 'Clothes') & (attributes.feature_value == 'Poncho') & (attributes.token_id == 1) ]

    # persist everything and sanity-check the new predictions against the previous run
    coefsdf.to_csv('./data/coefsdf.csv', index=False)
    salesdf.to_csv('./data/model_sales.csv', index=False)
    # 'pred_price copy.csv' is a manual snapshot of the previous run's predictions
    old = pd.read_csv('./data/pred_price copy.csv')
    old['token_id'] = old.token_id.astype(str)
    old = pred_price.merge(old, on=['collection','token_id'])
    old['ratio'] = old.pred_price_x / old.pred_price_y
    old = old.sort_values('ratio')
    old.columns = [ 'collection', 'token_id', 'nft_rank', 'rk_new', 'pred_price_new', 'pred_sd_x', 'rank', 'rk_old', 'pred_price_old', 'pred_sd_y', 'clean_token_id', 'ratio' ]
    m = m_df[(m_df.collection.isin(pred_price.collection.unique())) & (-(m_df.feature_name.isin(['nft_rank','adj_nft_rank_0','adj_nft_rank_1','adj_nft_rank_2'])))]
    m_p = m.pivot( index=['collection','token_id'], columns='feature_name', values='feature_value' ).reset_index()
    m_p.columns = [ 'collection','token_id' ] + sorted(m.feature_name.unique())
    m_p.head()
    old = old.merge(m_p, on=['collection','token_id'])
    old = old[[ 'token_id', 'nft_rank', 'rk_old', 'rk_new', 'pred_price_old', 'pred_price_new', 'ratio' ] + [c for c in m_p.columns if not c in ['token_id','collection']]]
    old.to_csv('~/Downloads/tmp1.csv', index=False)
    pred_price.head()
    old[old.token_id == '4857']
    old.head()
    old.tail()

    # nft_rank = m_df[m_df.feature_name=='nft_rank'][['collection','token_id','feature_value']].rename(columns={'feature_value': 'nft_rank'})
    # nft_rank['token_id'] = nft_rank.token_id.astype(str)
    # pred_price['token_id'] = pred_price.token_id.astype(str)
    # pred_price = pred_price.merge(nft_rank, how='left', on=['collection','token_id'])
    pred_price.to_csv('./data/pred_price.csv', index=False)
    # pred_price = pd.read_csv('./data/pred_price.csv')
    pred_price.groupby('collection')[['pred_price']].min()
    attributes.to_csv('./data/attributes.csv', index=False)
    attributes = pd.read_csv('./data/attributes.csv')
    attributes[attributes.rarity.isnull()]
    feature_values.to_csv('./data/feature_values.csv', index=False)

    # legacy utility: append nft_rank rows from pred_price back onto metadata.csv
    # metadata = pd.read_csv('./data/metadata.csv')
    # metadata['collection'] = metadata.collection.apply(lambda x: clean_name(x))
    # metadata['token_id'] = metadata.token_id.astype(str)
    # metadata.head()
    # nft_rank = pred_price[[ 'collection','token_id','nft_rank' ]].rename(columns={'nft_rank':'feature_value'})
    # nft_rank['feature_name'] = 'nft_rank'
    # metadata = metadata[metadata.feature_name != 'nft_rank']
    # nft_rank = merge(nft_rank, metadata[['collection','chain']].fillna('Solana').drop_duplicates())
    # metadata = metadata.append(nft_rank)
    # metadata.to_csv('./data/metadata.csv', index=False)
    if True or check_exclude:
        # flag suspicious sales: exclude_1 catches prices far ABOVE the model's
        # prediction, exclude_2 prices far BELOW it; both require a large absolute
        # gap and a large relative gap
        exclude = pd.read_csv('./data/exclude.csv')
        salesdf['rat'] = salesdf.price / salesdf.pred
        salesdf['dff'] = salesdf.price - salesdf.pred
        salesdf['exclude_1'] = (((salesdf.dff >= 20) & (salesdf.rat > 4)) | ((salesdf.dff >= 40) & (salesdf.rat > 3)) | ((salesdf.dff >= 60) & (salesdf.rat > 2.5)) | ((salesdf.dff >= 80) & (salesdf.rat > 2))).astype(int)
        salesdf['rat'] = salesdf.pred / salesdf.price
        salesdf['dff'] = salesdf.pred - salesdf.price
        salesdf['exclude_2'] = (((salesdf.dff >= 20) & (salesdf.rat > 4)) | ((salesdf.dff >= 40) & (salesdf.rat > 3)) | ((salesdf.dff >= 60) & (salesdf.rat > 2.5)) | ((salesdf.dff >= 80) & (salesdf.rat > 2))).astype(int)
        salesdf['exclude'] = (salesdf.exclude_1 + salesdf.exclude_2).apply(lambda x: int(x>0))
        print(salesdf.exclude_1.mean())
        print(salesdf.exclude_2.mean())
        print(salesdf.exclude.mean())
        salesdf[salesdf.token_id == '2239'][['collection','price','exclude']]
        exclude = exclude.append(salesdf[salesdf.exclude == 1][[ 'collection','token_id','price','exclude' ]])
        # salesdf[salesdf.exclude == 1][[ 'collection','token_id','price','exclude' ]].to_csv('./data/exclude.csv', index=False)
        exclude.to_csv('./data/exclude.csv', index=False)
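
        # e.g. a sale at 100 with a prediction of 20 has dff = 80 and rat = 5, so it is
        # flagged by exclude_1 and appended to ./data/exclude.csv for the next run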


# train_model(True, False)
# train_model(False, False)
train_model(False, True)