Beta allday espn id view (#255)

This commit is contained in:
Jack Forgash 2024-01-05 12:57:55 -07:00 committed by GitHub
parent 074c11c55a
commit 7c87cd4225
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 243 additions and 0 deletions

View File

@ -0,0 +1,86 @@
-- dbt model (materialized as a view) that attaches ESPN player / team ids to
-- NFL All Day moment metadata by matching on player name.
{{ config(
materialized = 'view'
) }}
-- espn: one row per ESPN athlete whose position's parent id is 70,
-- with the display name of the joined team and an is_active flag.
WITH espn AS (
SELECT
CASE
-- slight name mismatches
-- (rewritten names are what the FINAL join compares against allday.player)
WHEN A.full_name = 'Patrick Mahomes' THEN 'Patrick Mahomes II'
WHEN A.full_name = 'Joshua Palmer' THEN 'Josh Palmer'
WHEN A.full_name = 'Gabe Davis' THEN 'Gabriel Davis'
ELSE A.full_name
END AS player,
t.display_name AS team,
A.id AS espn_player_id,
-- Team id = the text of the team "$ref" URL that follows the 2023-season
-- teams path, cut at the first '?'.
-- NOTE(review): the prefix hard-codes the http scheme and the 2023 season; a
-- "$ref" that does not contain it presumably yields NULL here - confirm.
SPLIT(
SPLIT(TRY_PARSE_JSON(A.team) :"$ref", 'http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/2023/teams/') [1],
'?'
) [0] :: INT AS espn_team_id,
try_parse_json(status):type::string = 'active' as is_active -- note, this may depend on time of data pull from ESPN. Includes IR status. Update as needed.
FROM
{{ ref('beta__dim_nfl_athletes') }} A
LEFT JOIN {{ ref('beta__dim_nfl_teams') }}
t
-- The join condition uses the espn_team_id alias computed in the SELECT list above.
ON espn_team_id = t.id
WHERE
-- Position filter: id parsed from the parent position "$ref" URL, same
-- split-on-prefix / cut-at-'?' approach as espn_team_id.
SPLIT(
SPLIT(TRY_PARSE_JSON(POSITION) :parent :"$ref", 'http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/positions/') [1],
'?') [0] :: INT = 70 -- offense only
),
-- allday: PLAYER_GAME moments from season 2018 onward for QB / WR / RB / TE,
-- with the player position pulled out of moment_stats_full.
allday AS (
SELECT
nft_id,
nflallday_id,
serial_number,
moment_tier,
CASE
-- Some moments use DJ others D.J.
WHEN player = 'D.J. Moore' THEN 'DJ Moore'
ELSE player
END AS player,
team,
season,
play_type,
moment_stats_full :metadata :playerPosition :: STRING AS POSITION
FROM
{{ ref('nft__dim_allday_metadata') }}
ad
WHERE
classification = 'PLAYER_GAME'
AND season >= 2018
-- NOTE(review): POSITION presumably resolves to the alias defined in the
-- SELECT list above - confirm the source table has no column of that name.
AND POSITION IN (
'QB',
'WR',
'RB',
'TE'
)
),
-- FINAL: every allday row, plus ESPN ids where a name match exists (LEFT JOIN,
-- so unmatched moments keep NULL ESPN columns).
FINAL AS (
SELECT
nft_id,
nflallday_id,
serial_number,
moment_tier,
ad.player,
ad.team,
season,
play_type,
POSITION,
espn_player_id,
espn_team_id,
is_active
FROM
allday ad
LEFT JOIN espn
-- Match is on trimmed, lower-cased player name only; team is not part of the
-- condition. NOTE(review): two ESPN athletes sharing a name would duplicate
-- the allday row - confirm this cannot happen for the filtered positions.
ON TRIM(LOWER(
ad.player
)) = TRIM(LOWER(
espn.player
))
)
SELECT
*
FROM
FINAL

16
python/espn/README.md Normal file
View File

@ -0,0 +1,16 @@
# Log ESPN Player IDs
Python script that queries data from the ESPN API and stores it in Snowflake for prod use. At present, the player (athletes) endpoint is the one that will most likely need to be re-run before the season starts.
To pull the teams endpoint instead, uncomment lines 115-122 of `get_espn.py` and comment out lines 124-132. The target table name is currently set in-line in the script.
## .env
The script loads the following config variables to create a Snowflake connection. Write access to the prod database is required, because the data ingested from the ESPN API is uploaded to a bronze table.
```
USERNAME=<name>@flipsidecrypto.com
ROLE=<PROD_ROLE>
ACCOUNT=<SF_ACCT>
WAREHOUSE=<>
DATABASE=<>
SCHEMA=<>
```

141
python/espn/get_espn.py Normal file
View File

@ -0,0 +1,141 @@
import snowflake
import pandas as pd
import requests as res
import re
import json
import os
from datetime import datetime, timezone
from dotenv import load_dotenv
from snowflake.connector.pandas_tools import write_pandas
def upload_to_snowflake(conn, data, table_name):
    """Recreate ``table_name`` in Snowflake and load ``data`` into it.

    The records are turned into a pandas DataFrame, the column names are
    passed through ``camel_to_snake``, an ``_INSERTED_TIMESTAMP`` column
    holding the current UTC time is added and a ``$REF`` column (if present)
    is renamed to ``REF``. The table is then replaced with a column list
    derived from the frame's dtypes and the frame is written with
    ``write_pandas``.

    Args:
        conn: Open Snowflake connection used for the DDL and the upload.
        data: Records accepted by ``pd.DataFrame``.
        table_name: Target table; it is replaced on every call.
    """
    frame = pd.DataFrame(data)
    frame.columns = [camel_to_snake(col) for col in frame.columns]
    # Stamp every row with the time of this load.
    frame['_INSERTED_TIMESTAMP'] = datetime.now(timezone.utc)
    frame = frame.rename(columns={'$REF': 'REF'})

    # pandas dtype name -> Snowflake column type; anything unlisted is STRING.
    snowflake_types = {
        'object': 'STRING',
        'int64': 'NUMBER',
        'float64': 'FLOAT',
        'datetime64[ns]': 'FLOAT',  # loaded as unix timestamp. Can cast in SF using :: TIMESTAMP_NTZ
        'bool': 'BOOLEAN',
        'dict': 'VARIANT',
        # Add more mappings if needed
    }

    # TODO should change this to IF NOT EXISTS ? Or leave as-is because future pulls overwrite instead of append
    header = 'CREATE OR REPLACE TABLE {} (\n'.format(table_name)
    column_lines = ''.join(
        ' "{}" {},\n'.format(col, snowflake_types.get(str(dtype), 'STRING'))
        for col, dtype in frame.dtypes.items()
    )
    # Drop the trailing ",\n" left by the last column before closing the list.
    ddl = (header + column_lines).rstrip(',\n') + '\n);'

    conn.cursor().execute(ddl)
    write_pandas(conn, frame, table_name)
def get_espn(base_url, params):
    """Download every entity listed by a paginated ESPN endpoint.

    The listing at ``base_url`` is requested page by page (``?page=N`` plus
    ``params``) until a page comes back with an empty ``items`` list. Each
    listed item's ``$ref`` URL is then fetched individually. URLs that return
    a non-200 status or raise while being fetched are collected and, if there
    are any, written one per line to ``errors.txt`` in the working directory.

    Args:
        base_url: Listing endpoint to paginate through.
        params: Extra query parameters sent with every listing request.

    Returns:
        list: Decoded JSON bodies of the entities fetched successfully.

    Raises:
        KeyError: If a listing page has no ``items`` key, or a listed item
            has no ``$ref`` key.
    """
    refs = []
    data = []
    errors = []

    # One session for all requests so the connection to the API host is
    # reused instead of being re-established for every entity.
    with res.Session() as session:
        # paginate through endpoint until no items returned to get urls for each entity
        page = 1
        while True:
            r = session.get(f"{base_url}?page={page}", params=params)
            # Decode the page once and reuse it for both the check and the extend.
            items = r.json()['items']
            if not items:
                break
            refs.extend(items)
            page += 1

        # log completion of url request
        print(f"Retrieved {len(refs)} items from {base_url}")

        for i, item in enumerate(refs):
            ref = item['$ref']
            try:
                r = session.get(ref)
                # if success, append to data
                if r.status_code == 200:
                    data.append(r.json())
                else:
                    print(f"Error getting {ref}: {r.status_code}")
                    # append error to list of errors
                    errors.append(ref)
                # progress log on every 100th item that did not raise
                if i % 100 == 0:
                    print(f"Retrieved {i} entities")
            except Exception as e:
                # Best effort: one failing entity must not abort the whole pull.
                print(f"Error getting {ref}: {e}")
                # append error to list of errors
                errors.append(ref)

    # if any errors, write to file
    if errors:
        with open('errors.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(errors))
    return data
def camel_to_snake(name):
    """Return ``name`` converted from camelCase to UPPER_SNAKE_CASE.

    Used for column naming. Two substitution passes insert underscores:
    first before any capitalised word (an upper-case letter followed by
    lower-case letters), then between a lower-case letter or digit and an
    upper-case letter. The result is upper-cased.
    """
    boundary_patterns = (
        '(.)([A-Z][a-z]+)',
        '([a-z0-9])([A-Z])',
    )
    converted = name
    for pattern in boundary_patterns:
        converted = re.sub(pattern, r'\1_\2', converted)
    return converted.upper()
def main():
    """Pull ESPN NFL athletes and load them into Snowflake.

    Connection settings are read from the environment (after loading
    ``.env``): USERNAME, ACCOUNT, WAREHOUSE, DATABASE, SCHEMA and ROLE.
    Authentication uses the ``externalbrowser`` authenticator. The athletes
    endpoint is fetched with ``get_espn`` and the result is uploaded to the
    ``ESPN_NFL_ATHLETES`` table with ``upload_to_snowflake``.

    Returns:
        bool: ``True`` once the upload has completed.
    """
    load_dotenv()

    # Establish connection to Snowflake
    conn = snowflake.connector.connect(
        user=os.getenv('USERNAME'),
        authenticator="externalbrowser",
        account=os.getenv('ACCOUNT'),
        warehouse=os.getenv('WAREHOUSE'),
        database=os.getenv('DATABASE'),
        schema=os.getenv('SCHEMA'),
        role=os.getenv('ROLE'),
    )

    # Close the connection even when the pull or the upload fails.
    try:
        # teams_url = "https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/2023/teams"
        # params = {
        #     "limit": 50
        # }
        # table_name = "ESPN_NFL_TEAMS"
        # data = get_espn(teams_url, params)
        # upload_to_snowflake(conn, data, table_name)

        athletes_url = "https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/athletes"
        params = {
            "limit": 1000,
            "active": "true"
        }
        table_name = "ESPN_NFL_ATHLETES"
        data = get_espn(athletes_url, params)
        upload_to_snowflake(conn, data, table_name)
    finally:
        conn.close()

    return True


if __name__ == '__main__':
    main()