From 7c87cd42250599b181d80a120ee98f3118fbdc96 Mon Sep 17 00:00:00 2001 From: Jack Forgash <58153492+forgxyz@users.noreply.github.com> Date: Fri, 5 Jan 2024 12:57:55 -0700 Subject: [PATCH] Beta allday espn id view (#255) --- .../gold/beta/beta__ez_moment_player_ids.sql | 86 +++++++++++ python/espn/README.md | 16 ++ python/espn/get_espn.py | 141 ++++++++++++++++++ 3 files changed, 243 insertions(+) create mode 100644 models/gold/beta/beta__ez_moment_player_ids.sql create mode 100644 python/espn/README.md create mode 100644 python/espn/get_espn.py diff --git a/models/gold/beta/beta__ez_moment_player_ids.sql b/models/gold/beta/beta__ez_moment_player_ids.sql new file mode 100644 index 0000000..0fd3c13 --- /dev/null +++ b/models/gold/beta/beta__ez_moment_player_ids.sql @@ -0,0 +1,86 @@ +{{ config( + materialized = 'view' +) }} + +WITH espn AS ( + + SELECT + CASE + -- slight name mismatches + WHEN A.full_name = 'Patrick Mahomes' THEN 'Patrick Mahomes II' + WHEN A.full_name = 'Joshua Palmer' THEN 'Josh Palmer' + WHEN A.full_name = 'Gabe Davis' THEN 'Gabriel Davis' + ELSE A.full_name + END AS player, + t.display_name AS team, + A.id AS espn_player_id, + SPLIT( + SPLIT(TRY_PARSE_JSON(A.team) :"$ref", 'http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/2023/teams/') [1], + '?' + ) [0] :: INT AS espn_team_id, + try_parse_json(status):type::string = 'active' as is_active -- note, this may depend on time of data pull from ESPN. Includes IR status. Update as needed. + FROM + {{ ref('beta__dim_nfl_athletes') }} A + LEFT JOIN {{ ref('beta__dim_nfl_teams') }} + t + ON espn_team_id = t.id + WHERE + SPLIT( + SPLIT(TRY_PARSE_JSON(POSITION) :parent :"$ref", 'http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/positions/') [1], + '?') [0] :: INT = 70 -- offense only + ), + allday AS ( + SELECT + nft_id, + nflallday_id, + serial_number, + moment_tier, + CASE + -- Some moments use DJ others D.J. + WHEN player = 'D.J. 
Moore' THEN 'DJ Moore' + ELSE player + END AS player, + team, + season, + play_type, + moment_stats_full :metadata :playerPosition :: STRING AS POSITION + FROM + {{ ref('nft__dim_allday_metadata') }} + ad + WHERE + classification = 'PLAYER_GAME' + AND season >= 2018 + AND POSITION IN ( + 'QB', + 'WR', + 'RB', + 'TE' + ) + ), + FINAL AS ( + SELECT + nft_id, + nflallday_id, + serial_number, + moment_tier, + ad.player, + ad.team, + season, + play_type, + POSITION, + espn_player_id, + espn_team_id, + is_active + FROM + allday ad + LEFT JOIN espn + ON TRIM(LOWER( + ad.player + )) = TRIM(LOWER( + espn.player + )) + ) + SELECT + * + FROM + FINAL diff --git a/python/espn/README.md b/python/espn/README.md new file mode 100644 index 0000000..2eb9b7a --- /dev/null +++ b/python/espn/README.md @@ -0,0 +1,16 @@ +# Log ESPN Player IDs +Python script to query data from the ESPN API and store it in Snowflake for prod use. Presently, the player endpoint is what we will likely need to run again before the season starts. + +The teams endpoint can be set by uncommenting lines 115-122 and commenting out 124-132. The target table is presently set in-line in the script. + + +## .env +Script will load the following config variables to create a Snowflake connection. Write access to prod db is required, as the data ingested from the ESPN API is uploaded to a bronze table. 
USERNAME=<username>@flipsidecrypto.com
Or leave as-is because future pulls overwrite instead of append + create_table_stmt = 'CREATE OR REPLACE TABLE {} (\n'.format(table_name) + for col, dtype in df.dtypes.items(): + snowflake_dtype = dtype_mapping.get(str(dtype), 'STRING') + create_table_stmt += ' "{}" {},\n'.format(col, snowflake_dtype) + create_table_stmt = create_table_stmt.rstrip(',\n') + '\n);' + + # Execute the CREATE TABLE statement + conn.cursor().execute(create_table_stmt) + + # Upload the DataFrame to Snowflake + write_pandas(conn, df, table_name) + + +def get_espn(base_url, params): + # paginate through endpoint until no items returned to get urls for each entity + response = [] + page = 1 + while True: + r = res.get(f"{base_url}?page={page}", params=params) + if len(r.json()['items']) == 0: + break + response.extend(r.json()['items']) + page += 1 + + # log completion of url request + print(f"Retrieved {len(response)} items from {base_url}") + + data = [] + errors = [] + for i, url in enumerate(response): + try: + r = res.get(url['$ref']) + + # if success, append to data + if r.status_code == 200: + data.append(r.json()) + else: + print(f"Error getting {url['$ref']}: {r.status_code}") + # append error to list of errors + errors.append(url['$ref']) + + # log every 100 successful requests + if i % 100 == 0: + print(f"Retrieved {i} entities") + + except Exception as e: + print(f"Error getting {url['$ref']}: {e}") + # append error to list of errors + errors.append(url['$ref']) + + + # if any errors, write to file + if len(errors) > 0: + with open('errors.txt', 'w') as f: + f.write('\n'.join(errors)) + + return data + + +def camel_to_snake(name): + # Convert camel case to snake case for column naming + + name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).upper() + + +def main(): + load_dotenv() + # Establish connection to Snowflake + conn = snowflake.connector.connect( + user=os.getenv('USERNAME'), + authenticator= "externalbrowser", + 
account=os.getenv('ACCOUNT'), + warehouse=os.getenv('WAREHOUSE'), + database=os.getenv('DATABASE'), + schema=os.getenv('SCHEMA'), + role=os.getenv('ROLE'), + ) + + # teams_url = "https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/2023/teams" + # params = { + # "limit": 50 + # } + # table_name = "ESPN_NFL_TEAMS" + + # data = get_espn(teams_url, params) + # upload_to_snowflake(conn, data, table_name) + + athletes_url = "https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/athletes" + params = { + "limit": 1000, + "active": "true" + } + table_name = "ESPN_NFL_ATHLETES" + + data = get_espn(athletes_url, params) + upload_to_snowflake(conn, data, table_name) + + # Close the connection + conn.close() + + return True + +if __name__ == '__main__': + main()