ncaa_stats_py.basketball
1# Author: Joseph Armstrong (armstrongjoseph08@gmail.com) 2# File Name: `basketball.py` 3# Purpose: Houses functions that allows one to access NCAA basketball data 4# Creation Date: 2024-09-20 08:15 PM EDT 5# Update History: 6# - 2024-09-20 08:15 PM EDT 7# - 2024-11-01 12:10 AM EDT 8# - 2024-11-25 07:45 PM EDT 9# - 2025-01-04 03:00 PM EDT 10# - 2025-01-18 02:40 PM EDT 11# - 2025-02-01 02:40 PM EDT 12# - 2025-02-05 08:50 PM EDT 13 14 15import logging 16import re 17from datetime import date, datetime 18from os import mkdir 19from os.path import exists, expanduser, getmtime 20 21import numpy as np 22import pandas as pd 23from bs4 import BeautifulSoup 24from dateutil import parser 25from pytz import timezone 26from tqdm import tqdm 27 28from ncaa_stats_py.utls import ( 29 _format_folder_str, 30 _get_minute_formatted_time_from_seconds, 31 _get_schools, 32 _get_webpage, 33) 34 35 36def get_basketball_teams( 37 season: int, 38 level: str | int, 39 get_wbb_data: bool = False 40) -> pd.DataFrame: 41 """ 42 Retrieves a list of basketball teams from the NCAA. 43 44 Parameters 45 ---------- 46 `season` (int, mandatory): 47 Required argument. 48 Specifies the season you want NCAA basketball team information from. 49 50 `level` (int, mandatory): 51 Required argument. 52 Specifies the level/division you want 53 NCAA basketball team information from. 54 This can either be an integer (1-3) or a string ("I"-"III"). 55 56 `get_wbb_data` (bool, optional): 57 Optional argument. 58 If you want women's basketball data instead of men's basketball data, 59 set this to `True`. 60 61 Usage 62 ---------- 63 ```python 64 65 from ncaa_stats_py.basketball import get_basketball_teams 66 67 ######################################## 68 # Men's Basketball # 69 ######################################## 70 71 # Get all D1 men's basketball teams for the 2024 season. 72 print("Get all D1 men's basketball teams for the 2024 season.") 73 df = get_basketball_teams(2024, 1) 74 print(df) 75 76 # Get all D2 men's basketball teams for the 2023 season. 77 print("Get all D2 men's basketball teams for the 2023 season.") 78 df = get_basketball_teams(2023, 2) 79 print(df) 80 81 # Get all D3 men's basketball teams for the 2022 season. 82 print("Get all D3 men's basketball teams for the 2022 season.") 83 df = get_basketball_teams(2022, 3) 84 print(df) 85 86 # Get all D1 men's basketball teams for the 2021 season. 87 print("Get all D1 men's basketball teams for the 2021 season.") 88 df = get_basketball_teams(2021, "I") 89 print(df) 90 91 # Get all D2 men's basketball teams for the 2020 season. 92 print("Get all D2 men's basketball teams for the 2020 season.") 93 df = get_basketball_teams(2020, "II") 94 print(df) 95 96 # Get all D3 men's basketball teams for the 2019 season. 97 print("Get all D3 men's basketball teams for the 2019 season.") 98 df = get_basketball_teams(2019, "III") 99 print(df) 100 101 ######################################## 102 # Women's Basketball # 103 ######################################## 104 105 # Get all D1 women's basketball teams for the 2024 season. 106 print( 107 "Get all D1 women's basketball teams for the 2024 season." 108 ) 109 df = get_basketball_teams(2024, 1) 110 print(df) 111 112 # Get all D2 women's basketball teams for the 2023 season. 113 print( 114 "Get all D2 women's basketball teams for the 2023 season." 115 ) 116 df = get_basketball_teams(2023, 2) 117 print(df) 118 119 # Get all D3 women's basketball teams for the 2022 season. 120 print( 121 "Get all D3 women's basketball teams for the 2022 season." 122 ) 123 df = get_basketball_teams(2022, 3) 124 print(df) 125 126 # Get all D1 women's basketball teams for the 2021 season. 127 print( 128 "Get all D1 women's basketball teams for the 2021 season." 129 ) 130 df = get_basketball_teams(2021, "I") 131 print(df) 132 133 # Get all D2 women's basketball teams for the 2020 season. 134 print( 135 "Get all D2 women's basketball teams for the 2020 season." 136 ) 137 df = get_basketball_teams(2020, "II") 138 print(df) 139 140 # Get all D3 women's basketball teams for the 2019 season. 141 print( 142 "Get all D3 women's basketball teams for the 2019 season." 143 ) 144 df = get_basketball_teams(2019, "III") 145 print(df) 146 147 ``` 148 149 Returns 150 ---------- 151 A pandas `DataFrame` object with a list of college basketball teams 152 in that season and NCAA level. 153 """ 154 # def is_comment(elem): 155 # return isinstance(elem, Comment) 156 sport_id = "" 157 # stat_sequence = 0 158 load_from_cache = True 159 home_dir = expanduser("~") 160 home_dir = _format_folder_str(home_dir) 161 teams_df = pd.DataFrame() 162 teams_df_arr = [] 163 temp_df = pd.DataFrame() 164 formatted_level = "" 165 ncaa_level = 0 166 167 if get_wbb_data is True: 168 sport_id = "WBB" 169 stat_sequence = 169 170 else: 171 sport_id = "MBB" 172 stat_sequence = 168 173 174 if isinstance(level, int) and level == 1: 175 formatted_level = "I" 176 ncaa_level = 1 177 elif isinstance(level, int) and level == 2: 178 formatted_level = "II" 179 ncaa_level = 2 180 elif isinstance(level, int) and level == 3: 181 formatted_level = "III" 182 ncaa_level = 3 183 elif isinstance(level, str) and ( 184 level.lower() == "i" or level.lower() == "d1" or level.lower() == "1" 185 ): 186 ncaa_level = 1 187 formatted_level = level.upper() 188 elif isinstance(level, str) and ( 189 level.lower() == "ii" or level.lower() == "d2" or level.lower() == "2" 190 ): 191 ncaa_level = 2 192 formatted_level = level.upper() 193 elif isinstance(level, str) and ( 194 level.lower() == "iii" or level.lower() == "d3" or level.lower() == "3" 195 ): 196 ncaa_level = 3 197 formatted_level = level.upper() 198 199 if exists(f"{home_dir}/.ncaa_stats_py/"): 200 pass 201 else: 202 mkdir(f"{home_dir}/.ncaa_stats_py/") 203 204 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 205 pass 206 else: 207 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 208 209 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/"): 210 pass 211 else: 212 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}//teams/") 213 214 if exists( 215 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 216 + f"{season}_{formatted_level}_teams.csv" 217 ): 218 teams_df = pd.read_csv( 219 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 220 + f"{season}_{formatted_level}_teams.csv" 221 ) 222 file_mod_datetime = datetime.fromtimestamp( 223 getmtime( 224 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 225 + f"{season}_{formatted_level}_teams.csv" 226 ) 227 ) 228 else: 229 file_mod_datetime = datetime.today() 230 load_from_cache = False 231 232 now = datetime.today() 233 234 age = now - file_mod_datetime 235 236 if ( 237 age.days > 1 and 238 season >= (now.year - 1) and 239 now.month <= 7 240 ): 241 load_from_cache = False 242 elif age.days >= 35: 243 load_from_cache = False 244 245 if load_from_cache is True: 246 return teams_df 247 248 logging.warning( 249 f"Either we could not load {season} D{level} schools from cache, " 250 + "or it's time to refresh the cached data." 251 ) 252 schools_df = _get_schools() 253 url = ( 254 "https://stats.ncaa.org/rankings/change_sport_year_div?" 255 + f"academic_year={season}.0&division={ncaa_level}.0" + 256 f"&sport_code={sport_id}" 257 ) 258 259 response = _get_webpage(url=url) 260 261 soup = BeautifulSoup(response.text, features="lxml") 262 ranking_periods = soup.find("select", {"name": "rp", "id": "rp"}) 263 ranking_periods = ranking_periods.find_all("option") 264 265 rp_value = 0 266 found_value = False 267 268 while found_value is False: 269 # print("check") 270 for rp in ranking_periods: 271 if "final " in rp.text.lower(): 272 rp_value = rp.get("value") 273 found_value = True 274 break 275 else: 276 rp_value = rp.get("value") 277 found_value = True 278 break 279 280 url = ( 281 "https://stats.ncaa.org/rankings/institution_trends?" 282 + f"academic_year={season}.0&division={ncaa_level}.0&" 283 + f"ranking_period={rp_value}&sport_code={sport_id}" 284 + f"&sport_code={sport_id}" 285 ) 286 287 best_method = True 288 if ( 289 (season < 2015 and sport_id == "MBB") 290 ): 291 url = ( 292 "https://stats.ncaa.org/rankings/national_ranking?" 293 + f"academic_year={season}.0&division={ncaa_level}.0&" 294 + f"ranking_period={rp_value}&sport_code={sport_id}" 295 + f"&stat_seq={stat_sequence}" 296 ) 297 response = _get_webpage(url=url) 298 best_method = False 299 elif season < 2013: 300 url = ( 301 "https://stats.ncaa.org/rankings/national_ranking?" 302 + f"academic_year={season}.0&division={ncaa_level}.0&" 303 + f"ranking_period={rp_value}&sport_code={sport_id}" 304 + f"&stat_seq={stat_sequence}" 305 ) 306 response = _get_webpage(url=url) 307 best_method = False 308 else: 309 try: 310 response = _get_webpage(url=url) 311 except Exception as e: 312 logging.info(f"Found exception when loading teams `{e}`") 313 logging.info("Attempting backup method.") 314 url = ( 315 "https://stats.ncaa.org/rankings/national_ranking?" 316 + f"academic_year={season}.0&division={ncaa_level}.0&" 317 + f"ranking_period={rp_value}&sport_code={sport_id}" 318 + f"&stat_seq={stat_sequence}" 319 ) 320 response = _get_webpage(url=url) 321 best_method = False 322 323 soup = BeautifulSoup(response.text, features="lxml") 324 325 if best_method is True: 326 soup = soup.find( 327 "table", 328 {"id": "stat_grid"}, 329 ) 330 soup = soup.find("tbody") 331 t_rows = soup.find_all("tr") 332 333 for t in t_rows: 334 team_id = t.find("a") 335 team_id = team_id.get("href") 336 team_id = team_id.replace("/teams/", "") 337 team_id = int(team_id) 338 team_name = t.find_all("td")[0].text 339 team_conference_name = t.find_all("td")[1].text 340 # del team 341 temp_df = pd.DataFrame( 342 { 343 "season": season, 344 "ncaa_division": ncaa_level, 345 "ncaa_division_formatted": formatted_level, 346 "team_conference_name": team_conference_name, 347 "team_id": team_id, 348 "school_name": team_name, 349 "sport_id": sport_id, 350 }, 351 index=[0], 352 ) 353 teams_df_arr.append(temp_df) 354 del temp_df 355 else: 356 soup = soup.find( 357 "table", 358 {"id": "rankings_table"}, 359 ) 360 soup = soup.find("tbody") 361 t_rows = soup.find_all("tr") 362 363 for t in t_rows: 364 team_id = t.find("a") 365 team_id = team_id.get("href") 366 team_id = team_id.replace("/teams/", "") 367 team_id = int(team_id) 368 team = t.find_all("td")[1].get("data-order") 369 team_name, team_conference_name = team.split(",") 370 del team 371 temp_df = pd.DataFrame( 372 { 373 "season": season, 374 "ncaa_division": ncaa_level, 375 "ncaa_division_formatted": formatted_level, 376 "team_conference_name": team_conference_name, 377 "team_id": team_id, 378 "school_name": team_name, 379 "sport_id": sport_id, 380 }, 381 index=[0], 382 ) 383 teams_df_arr.append(temp_df) 384 del temp_df 385 386 teams_df = pd.concat(teams_df_arr, ignore_index=True) 387 teams_df = pd.merge( 388 left=teams_df, 389 right=schools_df, 390 on=["school_name"], 391 how="left" 392 ) 393 teams_df.sort_values(by=["team_id"], inplace=True) 394 395 teams_df.to_csv( 396 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 397 + f"{season}_{formatted_level}_teams.csv", 398 index=False, 399 ) 400 401 return teams_df 402 403 404def load_basketball_teams( 405 start_year: int = 2011, 406 get_wbb_data: bool = False 407) -> pd.DataFrame: 408 """ 409 Compiles a list of known NCAA basketball teams in NCAA basketball history. 410 411 Parameters 412 ---------- 413 `start_year` (int, optional): 414 Optional argument. 415 Specifies the first season you want 416 NCAA basketball team information from. 417 418 `get_wbb_data` (bool, optional): 419 Optional argument. 420 If you want women's basketball data instead of men's basketball data, 421 set this to `True`. 422 423 Usage 424 ---------- 425 ```python 426 427 from ncaa_stats_py.basketball import load_basketball_teams 428 429 # WARNING: Running this script "as-is" for the first time may 430 # take some time. 431 # The *N*th time you run this script will be faster. 432 433 # Load in every women's basketball team 434 # from 2011 to present day. 435 print( 436 "Load in every women's basketball team " + 437 "from 2011 to present day." 438 ) 439 df = load_basketball_teams(get_wbb_data=True) 440 print(df) 441 442 # Load in every men's basketball team 443 # from 2011 to present day. 444 print( 445 "Load in every men's basketball team " + 446 "from 2011 to present day." 447 ) 448 df = load_basketball_teams() 449 print(df) 450 451 # Load in every men's basketball team 452 # from 2020 to present day. 453 print( 454 "Load in every men's basketball team " + 455 "from 2020 to present day." 456 ) 457 df = load_basketball_teams(start_year=2020) 458 print(df) 459 460 ``` 461 462 Returns 463 ---------- 464 A pandas `DataFrame` object with a list of 465 all known college basketball teams. 466 467 """ 468 # start_year = 2008 469 470 # if get_wbb_data is True: 471 # sport_id = "WBB" 472 # else: 473 # sport_id = "MBB" 474 475 teams_df = pd.DataFrame() 476 teams_df_arr = [] 477 temp_df = pd.DataFrame() 478 479 now = datetime.now() 480 ncaa_divisions = ["I", "II", "III"] 481 if now.month > 5: 482 ncaa_seasons = [x for x in range(start_year, (now.year + 2))] 483 else: 484 ncaa_seasons = [x for x in range(start_year, (now.year + 1))] 485 486 logging.info( 487 "Loading in all NCAA basketball teams. " 488 + "If this is the first time you're seeing this message, " 489 + "it may take some time (3-10 minutes) for this to load." 490 ) 491 for s in ncaa_seasons: 492 logging.info(f"Loading in basketball teams for the {s} season.") 493 for d in ncaa_divisions: 494 try: 495 temp_df = get_basketball_teams(season=s, level=d) 496 teams_df_arr.append(temp_df) 497 del temp_df 498 except Exception as e: 499 logging.warning( 500 "Unhandled exception when trying to " + 501 f"get the teams. Full exception: `{e}`" 502 ) 503 504 505 teams_df = pd.concat(teams_df_arr, ignore_index=True) 506 teams_df = teams_df.infer_objects() 507 return teams_df 508 509 510def get_basketball_team_schedule(team_id: int) -> pd.DataFrame: 511 """ 512 Retrieves a team schedule, from a valid NCAA basketball team ID. 513 514 Parameters 515 ---------- 516 `team_id` (int, mandatory): 517 Required argument. 518 Specifies the team you want a schedule from. 519 This is separate from a school ID, which identifies the institution. 520 A team ID should be unique to a school, and a season. 521 522 Usage 523 ---------- 524 ```python 525 526 from ncaa_stats_py.basketball import get_basketball_team_schedule 527 528 ######################################## 529 # Men's Basketball # 530 ######################################## 531 532 # Get the team schedule for the 533 # 2024 Wright St. MBB team (D1, ID: 561255). 534 print( 535 "Get the team schedule for the " + 536 "2024 Wright St. MBB team (D1, ID: 561255)." 537 ) 538 df = get_basketball_team_schedule(561255) 539 print(df) 540 541 # Get the team schedule for the 542 # 2023 Caldwell MBB team (D2, ID: 542813). 543 print( 544 "Get the team schedule for the " + 545 "2023 Caldwell MBB team (D2, ID: 542813)." 546 ) 547 df = get_basketball_team_schedule(542813) 548 print(df) 549 550 # Get the team schedule for the 551 # 2022 SUNY Maritime MBB team (D3, ID: 528097). 552 print( 553 "Get the team schedule for the " + 554 "2022 SUNY Maritime MBB team (D3, ID: 528097)." 555 ) 556 df = get_basketball_team_schedule(528097) 557 print(df) 558 559 ######################################## 560 # Women's Basketball # 561 ######################################## 562 563 # Get the team schedule for the 564 # 2021 Wake Forest WBB team (D1, ID: 506339). 565 print( 566 "Get the team schedule for the " + 567 "2021 Wake Forest WBB team (D1, ID: 506339)." 568 ) 569 df = get_basketball_team_schedule(506339) 570 print(df) 571 572 # Get the team schedule for the 573 # 2020 Trevecca Nazarene WBB team (D2, ID: 484527). 574 print( 575 "Get the team schedule for the " + 576 "2020 Trevecca Nazarene WBB team (D2, ID: 484527)." 577 ) 578 df = get_basketball_team_schedule(484527) 579 print(df) 580 581 # Get the team schedule for the 582 # 2019 Simpson WBB team (D3, ID: 452452). 583 print( 584 "Get the team schedule for the " + 585 "2019 Simpson WBB team (D3, ID: 452452)." 586 ) 587 df = get_basketball_team_schedule(452452) 588 print(df) 589 590 ``` 591 592 Returns 593 ---------- 594 A pandas `DataFrame` object with an NCAA basketball team's schedule. 595 596 """ 597 598 sport_id = "" 599 schools_df = _get_schools() 600 games_df = pd.DataFrame() 601 games_df_arr = [] 602 season = 0 603 temp_df = pd.DataFrame() 604 load_from_cache = True 605 606 home_dir = expanduser("~") 607 home_dir = _format_folder_str(home_dir) 608 609 url = f"https://stats.ncaa.org/teams/{team_id}" 610 611 try: 612 team_df = load_basketball_teams() 613 team_df = team_df[team_df["team_id"] == team_id] 614 season = team_df["season"].iloc[0] 615 ncaa_division = team_df["ncaa_division"].iloc[0] 616 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 617 sport_id = "MBB" 618 except Exception: 619 team_df = load_basketball_teams(get_wbb_data=True) 620 team_df = team_df[team_df["team_id"] == team_id] 621 season = team_df["season"].iloc[0] 622 ncaa_division = team_df["ncaa_division"].iloc[0] 623 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 624 sport_id = "WBB" 625 # team_conference_name = team_df["team_conference_name"].iloc[0] 626 # school_name = team_df["school_name"].iloc[0] 627 # school_id = int(team_df["school_id"].iloc[0]) 628 629 del team_df 630 631 if exists(f"{home_dir}/.ncaa_stats_py/"): 632 pass 633 else: 634 mkdir(f"{home_dir}/.ncaa_stats_py/") 635 636 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 637 pass 638 else: 639 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 640 641 if exists( 642 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 643 ): 644 pass 645 else: 646 mkdir( 647 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 648 ) 649 650 if exists( 651 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 652 + f"{team_id}_team_schedule.csv" 653 ): 654 games_df = pd.read_csv( 655 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 656 + f"{team_id}_team_schedule.csv" 657 ) 658 file_mod_datetime = datetime.fromtimestamp( 659 getmtime( 660 f"{home_dir}/.ncaa_stats_py/" 661 + f"basketball_{sport_id}/team_schedule/" 662 + f"{team_id}_team_schedule.csv" 663 ) 664 ) 665 else: 666 file_mod_datetime = datetime.today() 667 load_from_cache = False 668 669 now = datetime.today() 670 671 age = now - file_mod_datetime 672 if ( 673 age.days > 1 and 674 season >= now.year 675 ): 676 load_from_cache = False 677 678 if load_from_cache is True: 679 return games_df 680 681 response = _get_webpage(url=url) 682 soup = BeautifulSoup(response.text, features="lxml") 683 684 school_name = soup.find("div", {"class": "card"}).find("img").get("alt") 685 season_name = ( 686 soup.find("select", {"id": "year_list"}) 687 .find("option", {"selected": "selected"}) 688 .text 689 ) 690 # For NCAA basketball, the season always starts in the fall semester, 691 # and ends in the spring semester. 692 # Thus, if `season_name` = "2011-12", this is the "2012" basketball season, 693 # because 2012 would encompass the fall and spring semesters 694 # for NCAA member institutions. 695 # season = f"{season_name[0:2]}{season_name[-2:]}" 696 # season = int(season) 697 soup = soup.find_all( 698 "div", 699 {"class": "col p-0"}, 700 ) 701 702 # declaring it here to prevent potential problems down the road. 703 table_data = "" 704 for s in soup: 705 try: 706 temp_name = s.find("div", {"class": "card-header"}) 707 temp_name = temp_name.text 708 except Exception as e: 709 logging.warning( 710 f"Could not parse card header. Full exception `{e}`. " 711 + "Attempting alternate method." 712 ) 713 temp_name = s.find("tr", {"class": "heading"}).find("td").text 714 715 if "schedule" in temp_name.lower(): 716 table_data = s.find("table") 717 718 t_rows = table_data.find_all("tr", {"class": "underline_rows"}) 719 720 if len(t_rows) == 0: 721 t_rows = table_data.find_all("tr") 722 723 for g in t_rows: 724 is_valid_row = True 725 game_num = 1 726 ot_periods = 0 727 is_home_game = True 728 is_neutral_game = False 729 730 cells = g.find_all("td") 731 if len(cells) <= 1: 732 # Because of how *well* designed 733 # stats.ncaa.org is, if we have to use execute 734 # the `if len(t_rows) == 0:` code, 735 # we need to catch any cases where every element in a 736 # table row (`<tr>`) is a table header (`<th>`), 737 # instead of a table data cell (`<td>`) 738 continue 739 740 game_date = cells[0].text 741 742 # If "(" is in the same cell as the date, 743 # this means that this game is an extra innings game. 744 # The number encased in `()` is the actual number of innings. 745 # We need to remove that from the date, 746 # and move it into a separate variable. 747 if "(" in game_date: 748 game_date = game_date.replace(")", "") 749 game_date, game_num = game_date.split("(") 750 game_date = game_date.strip() 751 game_num = int(game_num.strip()) 752 753 game_date = datetime.strptime(game_date, "%m/%d/%Y").date() 754 755 try: 756 opp_team_id = cells[1].find("a").get("href") 757 except IndexError: 758 logging.info( 759 "Skipping row because it is clearly " 760 + "not a row that has schedule data." 761 ) 762 is_valid_row = False 763 except AttributeError as e: 764 logging.info( 765 "Could not extract a team ID for this game. " + 766 f"Full exception {e}" 767 ) 768 opp_team_id = "-1" 769 except Exception as e: 770 logging.warning( 771 "An unhandled exception has occurred when " 772 + "trying to get the opposition team ID for this game. " 773 f"Full exception `{e}`." 774 ) 775 raise e 776 if is_valid_row is True: 777 if opp_team_id is not None: 778 opp_team_id = opp_team_id.replace("/teams/", "") 779 opp_team_id = int(opp_team_id) 780 781 try: 782 opp_team_name = cells[1].find("img").get("alt") 783 except AttributeError: 784 logging.info( 785 "Couldn't find the opposition team name " 786 + "for this row from an image element. " 787 + "Attempting a backup method" 788 ) 789 opp_team_name = cells[1].text 790 except Exception as e: 791 logging.info( 792 "Unhandled exception when trying to get the " 793 + "opposition team name from this game. " 794 + f"Full exception `{e}`" 795 ) 796 raise e 797 else: 798 opp_team_name = cells[1].text 799 800 if opp_team_name[0] == "@": 801 # The logic for determining if this game was a 802 # neutral site game doesn't care if that info is in 803 # `opp_team_name`. 804 opp_team_name = opp_team_name.strip().replace("@", "") 805 elif "@" in opp_team_name: 806 opp_team_name = opp_team_name.strip().split("@")[0] 807 # opp_team_show_name = cells[1].text.strip() 808 809 opp_text = cells[1].text 810 opp_text = opp_text.strip() 811 if "@" in opp_text and opp_text[0] == "@": 812 is_home_game = False 813 elif "@" in opp_text and opp_text[0] != "@": 814 is_neutral_game = True 815 is_home_game = False 816 # This is just to cover conference and NCAA championship 817 # tournaments. 818 elif "championship" in opp_text.lower(): 819 is_neutral_game = True 820 is_home_game = False 821 elif "ncaa" in opp_text.lower(): 822 is_neutral_game = True 823 is_home_game = False 824 825 del opp_text 826 827 score = cells[2].text.strip() 828 if len(score) == 0: 829 score_1 = 0 830 score_2 = 0 831 elif ( 832 "canceled" not in score.lower() and 833 "ppd" not in score.lower() 834 ): 835 score_1, score_2 = score.split("-") 836 837 # `score_1` should be "W `n`", "L `n`", or "T `n`", 838 # with `n` representing the number of runs this team 839 # scored in this game. 840 # Let's remove the "W", "L", or "T" from `score_1`, 841 # and determine which team won later on in this code. 842 if any(x in score_1 for x in ["W", "L", "T"]): 843 score_1 = score_1.split(" ")[1] 844 845 if "(" in score_2: 846 score_2 = score_2.replace(")", "") 847 score_2, ot_periods = score_2.split("(") 848 ot_periods = ot_periods.replace("OT", "") 849 ot_periods = ot_periods.replace(" ", "") 850 ot_periods = int(ot_periods) 851 852 if ot_periods is None: 853 ot_periods = 0 854 score_1 = int(score_1) 855 score_2 = int(score_2) 856 else: 857 score_1 = None 858 score_2 = None 859 860 try: 861 game_id = cells[2].find("a").get("href") 862 game_id = game_id.replace("/contests", "") 863 game_id = game_id.replace("/box_score", "") 864 game_id = game_id.replace("/", "") 865 game_id = int(game_id) 866 game_url = ( 867 f"https://stats.ncaa.org/contests/{game_id}/box_score" 868 ) 869 870 except AttributeError as e: 871 logging.info( 872 "Could not parse a game ID for this game. " 873 + f"Full exception `{e}`." 874 ) 875 game_id = None 876 game_url = None 877 except Exception as e: 878 logging.info( 879 "An unhandled exception occurred when trying " 880 + "to find a game ID for this game. " 881 + f"Full exception `{e}`." 882 ) 883 raise e 884 try: 885 attendance = cells[3].text 886 attendance = attendance.replace(",", "") 887 attendance = attendance.replace("\n", "") 888 attendance = int(attendance) 889 except IndexError as e: 890 logging.info( 891 "It doesn't appear as if there is an attendance column " 892 + "for this team's schedule table." 893 f"Full exception `{e}`." 894 ) 895 attendance = None 896 except ValueError as e: 897 logging.info( 898 "There doesn't appear as if " 899 + "there is a recorded attendance. " 900 + "for this game/row. " 901 f"Full exception `{e}`." 902 ) 903 attendance = None 904 905 except Exception as e: 906 logging.info( 907 "An unhandled exception occurred when trying " 908 + "to find this game's attendance. " 909 + f"Full exception `{e}`." 910 ) 911 raise e 912 913 if is_home_game is True: 914 temp_df = pd.DataFrame( 915 { 916 "season": season, 917 "season_name": season_name, 918 "game_id": game_id, 919 "game_date": game_date, 920 "game_num": game_num, 921 "ot_periods": ot_periods, 922 "home_team_id": team_id, 923 "home_team_name": school_name, 924 "away_team_id": opp_team_id, 925 "away_team_name": opp_team_name, 926 "home_team_score": score_1, 927 "away_team_score": score_2, 928 "is_neutral_game": is_neutral_game, 929 "game_url": game_url, 930 }, 931 index=[0], 932 ) 933 games_df_arr.append(temp_df) 934 del temp_df 935 elif is_neutral_game is True: 936 # For the sake of simplicity, 937 # order both team ID's, 938 # and set the lower number of the two as 939 # the "away" team in this neutral site game, 940 # just so there's no confusion if someone 941 # combines a ton of these team schedule `DataFrame`s, 942 # and wants to remove duplicates afterwards. 943 t_ids = [opp_team_id, team_id] 944 t_ids.sort() 945 946 if t_ids[0] == team_id: 947 # home 948 temp_df = pd.DataFrame( 949 { 950 "season": season, 951 "season_name": season_name, 952 "game_id": game_id, 953 "game_date": game_date, 954 "game_num": game_num, 955 "ot_periods": ot_periods, 956 "home_team_id": team_id, 957 "home_team_name": school_name, 958 "away_team_id": opp_team_id, 959 "away_team_name": opp_team_name, 960 "home_team_score": score_1, 961 "away_team_score": score_2, 962 "is_neutral_game": is_neutral_game, 963 "game_url": game_url, 964 }, 965 index=[0], 966 ) 967 968 else: 969 # away 970 temp_df = pd.DataFrame( 971 { 972 "season": season, 973 "season_name": season_name, 974 "game_id": game_id, 975 "game_date": game_date, 976 "game_num": game_num, 977 "ot_periods": ot_periods, 978 "home_team_id": opp_team_id, 979 "home_team_name": opp_team_name, 980 "away_team_id": team_id, 981 "away_team_name": school_name, 982 "home_team_score": score_2, 983 "away_team_score": score_1, 984 "is_neutral_game": is_neutral_game, 985 "game_url": game_url, 986 }, 987 index=[0], 988 ) 989 990 games_df_arr.append(temp_df) 991 del temp_df 992 else: 993 temp_df = pd.DataFrame( 994 { 995 "season": season, 996 "season_name": season_name, 997 "game_id": game_id, 998 "game_date": game_date, 999 "game_num": game_num, 1000 "ot_periods": ot_periods, 1001 "home_team_id": opp_team_id, 1002 "home_team_name": opp_team_name, 1003 "away_team_id": team_id, 1004 "away_team_name": school_name, 1005 "home_team_score": score_2, 1006 "away_team_score": score_1, 1007 "is_neutral_game": is_neutral_game, 1008 "game_url": game_url, 1009 }, 1010 index=[0], 1011 ) 1012 1013 games_df_arr.append(temp_df) 1014 del temp_df 1015 1016 # team_photo = team_id.find("img").get("src") 1017 1018 games_df = pd.concat(games_df_arr, ignore_index=True) 1019 1020 temp_df = schools_df.rename( 1021 columns={ 1022 "school_name": "home_team_name", 1023 "school_id": "home_school_id" 1024 } 1025 ) 1026 games_df = games_df.merge(right=temp_df, on="home_team_name", how="left") 1027 1028 temp_df = schools_df.rename( 1029 columns={ 1030 "school_name": "away_team_name", 1031 "school_id": "away_school_id" 1032 } 1033 ) 1034 games_df = games_df.merge(right=temp_df, on="away_team_name", how="left") 1035 games_df["ncaa_division"] = ncaa_division 1036 games_df["ncaa_division_formatted"] = ncaa_division_formatted 1037 1038 # games_df["game_url"] = games_df["game_url"].str.replace("/box_score", "") 1039 games_df.to_csv( 1040 f"{home_dir}/.ncaa_stats_py/" 1041 + f"basketball_{sport_id}/team_schedule/" 1042 + f"{team_id}_team_schedule.csv", 1043 index=False, 1044 ) 1045 1046 return games_df 1047 1048 1049def get_basketball_day_schedule( 1050 game_date: str | date | datetime, 1051 level: str | int = "I", 1052 get_wbb_data: bool = False 1053): 1054 """ 1055 Given a date and NCAA level, this function retrieves basketball every game 1056 for that date. 1057 1058 Parameters 1059 ---------- 1060 `game_date` (int, mandatory): 1061 Required argument. 1062 Specifies the date you want a basketball schedule from. 1063 For best results, pass a string formatted as "YYYY-MM-DD". 1064 1065 `level` (int, mandatory): 1066 Required argument. 1067 Specifies the level/division you want a 1068 NCAA basketball schedule from. 1069 This can either be an integer (1-3) or a string ("I"-"III"). 1070 1071 `get_wbb_data` (bool, optional): 1072 Optional argument. 1073 If you want women's basketball data instead of men's basketball data, 1074 set this to `True`. 1075 1076 Usage 1077 ---------- 1078 ```python 1079 1080 from ncaa_stats_py.basketball import get_basketball_day_schedule 1081 1082 1083 # Get all DI games that will be played on April 22th, 2025. 1084 print("Get all games that will be played on April 22th, 2025.") 1085 df = get_basketball_day_schedule("2025-04-22", level=1) 1086 print(df) 1087 1088 # Get all division II games that were played on February 14th, 2025. 1089 print("Get all division II games that were played on February 14th, 2025.") 1090 df = get_basketball_day_schedule("2025-02-14", level="I") 1091 print(df) 1092 1093 # Get all DI games that were played on December 10th, 2024. 1094 print("Get all games that were played on December 10th, 2024.") 1095 df = get_basketball_day_schedule("2024-12-10", level="I") 1096 print(df) 1097 1098 # Get all DI games (if any) that were played on December 12th, 2024. 1099 print("Get all DI games (if any) that were played on December 12th, 2024.") 1100 df = get_basketball_day_schedule("2024-12-12") 1101 print(df) 1102 1103 # Get all DII games played on January 14th, 2024. 1104 print("Get all DI games played on January 14th, 2024.") 1105 df = get_basketball_day_schedule("2024-01-14") 1106 print(df) 1107 1108 # Get all division III games played on December 16th, 2023. 1109 print("Get all division III games played on December 16th, 2023.") 1110 df = get_basketball_day_schedule("2023-12-16") 1111 print(df) 1112 1113 ``` 1114 1115 Returns 1116 ---------- 1117 A pandas `DataFrame` object with all basketball games played on that day, 1118 for that NCAA division/level. 1119 1120 """ 1121 1122 season = 0 1123 sport_id = "MBB" 1124 1125 schedule_df = pd.DataFrame() 1126 schedule_df_arr = [] 1127 1128 if isinstance(game_date, date): 1129 game_datetime = datetime.combine( 1130 game_date, datetime.min.time() 1131 ) 1132 elif isinstance(game_date, datetime): 1133 game_datetime = game_date 1134 elif isinstance(game_date, str): 1135 game_datetime = parser.parse( 1136 game_date 1137 ) 1138 else: 1139 unhandled_datatype = type(game_date) 1140 raise ValueError( 1141 f"Unhandled datatype for `game_date`: `{unhandled_datatype}`" 1142 ) 1143 1144 if isinstance(level, int) and level == 1: 1145 formatted_level = "I" 1146 ncaa_level = 1 1147 elif isinstance(level, int) and level == 2: 1148 formatted_level = "II" 1149 ncaa_level = 2 1150 elif isinstance(level, int) and level == 3: 1151 formatted_level = "III" 1152 ncaa_level = 3 1153 elif isinstance(level, str) and ( 1154 level.lower() == "i" or level.lower() == "d1" or level.lower() == "1" 1155 ): 1156 ncaa_level = 1 1157 formatted_level = level.upper() 1158 elif isinstance(level, str) and ( 1159 level.lower() == "ii" or level.lower() == "d2" or level.lower() == "2" 1160 ): 1161 ncaa_level = 2 1162 formatted_level = level.upper() 1163 elif isinstance(level, str) and ( 1164 level.lower() == "iii" or level.lower() == "d3" or level.lower() == "3" 1165 ): 1166 ncaa_level = 3 1167 formatted_level = level.upper() 1168 1169 del level 1170 1171 if get_wbb_data is True: 1172 sport_id = "WBB" 1173 elif get_wbb_data is False: 1174 sport_id = "MBB" 1175 else: 1176 raise ValueError( 1177 f"Unhandled value for `get_wbb_data`: `{get_wbb_data}`" 1178 ) 1179 1180 season = game_datetime.year 1181 game_month = game_datetime.month 1182 game_day = game_datetime.day 1183 game_year = game_datetime.year 1184 1185 if game_month > 7: 1186 season += 1 1187 url = ( 1188 "https://stats.ncaa.org/contests/" + 1189 f"livestream_scoreboards?utf8=%E2%9C%93&sport_code={sport_id}" + 1190 f"&academic_year={season}&division={ncaa_level}" + 1191 f"&game_date={game_month:00d}%2F{game_day:00d}%2F{game_year}" + 1192 "&commit=Submit" 1193 ) 1194 else: 1195 url = ( 1196 "https://stats.ncaa.org/contests/" + 1197 f"livestream_scoreboards?utf8=%E2%9C%93&sport_code={sport_id}" + 1198 f"&academic_year={season}&division={ncaa_level}" + 1199 f"&game_date={game_month:00d}%2F{game_day:00d}%2F{game_year}" + 1200 "&commit=Submit" 1201 ) 1202 1203 response = _get_webpage(url=url) 1204 soup = BeautifulSoup(response.text, features="lxml") 1205 1206 game_boxes = soup.find_all("div", {"class": "table-responsive"}) 1207 1208 for box in game_boxes: 1209 game_id = None 1210 game_alt_text = None 1211 game_num = 1 1212 # t_box = box.find("table") 1213 table_box = box.find("table") 1214 table_rows = table_box.find_all("tr") 1215 1216 # Date/attendance 1217 game_date_str = table_rows[0].find("div", {"class": "col-6 p-0"}).text 1218 game_date_str = game_date_str.replace("\n", "") 1219 game_date_str = game_date_str.strip() 1220 game_date_str = game_date_str.replace("TBA ", "TBA") 1221 game_date_str = game_date_str.replace("TBD ", "TBD") 1222 game_date_str = game_date_str.replace("PM ", "PM") 1223 game_date_str = game_date_str.replace("AM ", "AM") 1224 game_date_str = game_date_str.strip() 1225 attendance_str = table_rows[0].find( 1226 "div", 1227 {"class": "col p-0 text-right"} 1228 ).text 1229 1230 attendance_str = attendance_str.replace("Attend:", "") 1231 attendance_str = attendance_str.replace(",", "") 1232 attendance_str = attendance_str.replace("\n", "") 1233 if ( 1234 "st" in attendance_str.lower() or 1235 "nd" in attendance_str.lower() or 1236 "rd" in attendance_str.lower() or 1237 "th" in attendance_str.lower() 1238 ): 1239 # This is not an attendance, 1240 # this is whatever quarter/half/inning this game is in. 1241 attendance_num = None 1242 elif "final" in attendance_str.lower(): 1243 attendance_num = None 1244 elif len(attendance_str) > 0: 1245 attendance_num = int(attendance_str) 1246 else: 1247 attendance_num = None 1248 1249 if "(" in game_date_str: 1250 game_date_str = game_date_str.replace(")", "") 1251 game_date_str, game_num = game_date_str.split("(") 1252 game_num = int(game_num) 1253 1254 if "TBA" in game_date_str: 1255 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBA') 1256 elif "tba" in game_date_str: 1257 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tba') 1258 elif "TBD" in game_date_str: 1259 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBD') 1260 elif "tbd" in game_date_str: 1261 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tbd') 1262 elif ( 1263 "tbd" not in game_date_str.lower() and 1264 ":" not in game_date_str.lower() 1265 ): 1266 game_date_str = game_date_str.replace(" ", "") 1267 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y') 1268 else: 1269 game_datetime = datetime.strptime( 1270 game_date_str, 1271 '%m/%d/%Y %I:%M %p' 1272 ) 1273 game_datetime = game_datetime.astimezone(timezone("US/Eastern")) 1274 1275 game_alt_text = table_rows[1].find_all("td")[0].text 1276 if game_alt_text is not None and len(game_alt_text) > 0: 1277 game_alt_text = game_alt_text.replace("\n", "") 1278 game_alt_text = game_alt_text.strip() 1279 1280 if len(game_alt_text) == 0: 1281 game_alt_text = None 1282 1283 urls_arr = box.find_all("a") 1284 1285 for u in urls_arr: 1286 url_temp = u.get("href") 1287 if "contests" in url_temp: 1288 game_id = url_temp 1289 del url_temp 1290 1291 if game_id is None: 1292 for r in range(0, len(table_rows)): 1293 temp = table_rows[r] 1294 temp_id = temp.get("id") 1295 1296 if temp_id is not None and len(temp_id) > 0: 1297 game_id = temp_id 1298 1299 del urls_arr 1300 1301 game_id = game_id.replace("/contests", "") 1302 game_id = game_id.replace("/box_score", "") 1303 game_id = game_id.replace("/livestream_scoreboards", "") 1304 game_id = game_id.replace("/", "") 1305 game_id = game_id.replace("contest_", "") 1306 game_id = int(game_id) 1307 1308 table_rows = table_box.find_all("tr", {"id": f"contest_{game_id}"}) 1309 away_team_row = table_rows[0] 1310 home_team_row = table_rows[1] 1311 1312 # Away team 1313 td_arr = away_team_row.find_all("td") 1314 1315 try: 1316 away_team_name = td_arr[0].find("img").get("alt") 1317 except Exception: 1318 away_team_name = td_arr[1].text 1319 away_team_name = away_team_name.replace("\n", "") 1320 away_team_name = away_team_name.strip() 1321 1322 try: 1323 away_team_id = td_arr[1].find("a").get("href") 1324 away_team_id = away_team_id.replace("/teams/", "") 1325 away_team_id = int(away_team_id) 1326 except AttributeError: 1327 away_team_id = None 1328 logging.info("No team ID found for the away team") 1329 except Exception as e: 1330 raise e 1331 1332 away_points_scored = td_arr[-1].text 1333 away_points_scored = away_points_scored.replace("\n", "") 1334 away_points_scored = away_points_scored.replace("\xa0", "") 1335 if len(away_points_scored) > 0: 1336 away_points_scored = int(away_points_scored) 1337 else: 1338 away_points_scored = 0 1339 1340 del td_arr 1341 1342 # Home team 1343 td_arr = home_team_row.find_all("td") 1344 1345 try: 1346 home_team_name = td_arr[0].find("img").get("alt") 1347 except Exception: 1348 home_team_name = td_arr[1].text 1349 home_team_name = home_team_name.replace("\n", "") 1350 home_team_name = home_team_name.strip() 1351 1352 try: 1353 home_team_id = td_arr[1].find("a").get("href") 1354 home_team_id = home_team_id.replace("/teams/", "") 1355 home_team_id = int(home_team_id) 1356 except AttributeError: 1357 home_team_id = None 1358 logging.info("No team ID found for the home team") 1359 except Exception as e: 1360 raise e 1361 1362 home_points_scored = td_arr[-1].text 1363 home_points_scored = home_points_scored.replace("\n", "") 1364 home_points_scored = home_points_scored.replace("\xa0", "") 1365 if len(home_points_scored) > 0: 1366 home_points_scored = int(home_points_scored) 1367 else: 1368 home_points_scored = 0 1369 1370 temp_df = pd.DataFrame( 1371 { 1372 "season": season, 1373 "sport_id": sport_id, 1374 "game_date": game_datetime.strftime("%Y-%m-%d"), 1375 "game_datetime": game_datetime.isoformat(), 1376 "game_id": game_id, 1377 "formatted_level": formatted_level, 1378 "ncaa_level": ncaa_level, 1379 "game_alt_text": game_alt_text, 1380 "away_team_id": away_team_id, 1381 "away_team_name": away_team_name, 1382 "home_team_id": home_team_id, 1383 "home_team_name": home_team_name, 1384 "home_points_scored": home_points_scored, 1385 "away_points_scored": away_points_scored, 1386 "attendance": attendance_num 1387 }, 1388 index=[0] 1389 ) 1390 schedule_df_arr.append(temp_df) 1391 1392 del temp_df 1393 1394 if len(schedule_df_arr) >= 1: 1395 schedule_df = pd.concat(schedule_df_arr, ignore_index=True) 1396 else: 1397 logging.warning( 1398 "Could not find any game(s) for " 1399 + f"{game_datetime.year:00d}-{game_datetime.month:00d}" 1400 + f"-{game_datetime.day:00d}. " 1401 + "If you believe this is an error, " 1402 + "please raise an issue at " 1403 + "\n https://github.com/armstjc/ncaa_stats_py/issues \n" 1404 ) 1405 return schedule_df 1406 1407 1408def get_full_basketball_schedule( 1409 season: int, 1410 level: str | int = "I", 1411 get_wbb_data: bool = False 1412) -> pd.DataFrame: 1413 """ 1414 Retrieves a full basketball schedule, 1415 from an NCAA level (`"I"`, `"II"`, `"III"`). 1416 The way this is done is by going through every team in a division, 1417 and parsing the schedules of every team in a division. 1418 1419 This function will take time when first run (30-60 minutes)! 1420 You have been warned. 1421 1422 Parameters 1423 ---------- 1424 `season` (int, mandatory): 1425 Specifies the season you want a schedule from. 1426 1427 `level` (int | str, mandatory): 1428 Specifies the team you want a schedule from. 1429 1430 `get_wbb_data` (bool, optional): 1431 Optional argument. 1432 If you want women's basketball data instead of men's basketball data, 1433 set this to `True`. 1434 1435 Usage 1436 ---------- 1437 ```python 1438 1439 from ncaa_stats_py.basketball import get_full_basketball_schedule 1440 1441 # Get the entire 2024 schedule for the 2024 D1 basketball season. 1442 print("Get the entire 2024 schedule for the 2024 D1 basketball season.") 1443 df = get_full_basketball_schedule(season=2024, level="I") 1444 print(df) 1445 1446 # You can also input `level` as an integer. 1447 # In addition, this and other functions cache data, 1448 # so this should load very quickly 1449 # compared to the first run of this function. 1450 print("You can also input `level` as an integer.") 1451 print( 1452 "In addition, this and other functions cache data, " 1453 + "so this should load very quickly " 1454 + "compared to the first run of this function." 1455 ) 1456 df = get_full_basketball_schedule(season=2024, level=1) 1457 print(df) 1458 1459 ``` 1460 1461 Returns 1462 ---------- 1463 A pandas `DataFrame` object with an NCAA basketball 1464 schedule for a specific season and level. 1465 """ 1466 1467 sport_id = "" 1468 load_from_cache = True 1469 home_dir = expanduser("~") 1470 home_dir = _format_folder_str(home_dir) 1471 schedule_df = pd.DataFrame() 1472 schedule_df_arr = [] 1473 temp_df = pd.DataFrame() 1474 formatted_level = "" 1475 ncaa_level = 0 1476 1477 if get_wbb_data is True: 1478 sport_id = "WBB" 1479 else: 1480 sport_id = "MBB" 1481 1482 if isinstance(level, int) and level == 1: 1483 formatted_level = "I" 1484 ncaa_level = 1 1485 elif isinstance(level, int) and level == 2: 1486 formatted_level = "II" 1487 ncaa_level = 2 1488 elif isinstance(level, int) and level == 3: 1489 formatted_level = "III" 1490 ncaa_level = 3 1491 elif isinstance(level, str) and ( 1492 level.lower() == "i" or level.lower() == "d1" or level.lower() == "1" 1493 ): 1494 ncaa_level = 1 1495 formatted_level = level.upper() 1496 elif isinstance(level, str) and ( 1497 level.lower() == "ii" or level.lower() == "d2" or level.lower() == "2" 1498 ): 1499 ncaa_level = 2 1500 formatted_level = level.upper() 1501 elif isinstance(level, str) and ( 1502 level.lower() == "iii" or level.lower() == "d3" or level.lower() == "3" 1503 ): 1504 ncaa_level = 3 1505 formatted_level = level.upper() 1506 1507 del level 1508 1509 if exists(f"{home_dir}/.ncaa_stats_py/"): 1510 pass 1511 else: 1512 mkdir(f"{home_dir}/.ncaa_stats_py/") 1513 1514 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 1515 pass 1516 else: 1517 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 1518 1519 if exists( 1520 f"{home_dir}/.ncaa_stats_py/" + 1521 f"basketball_{sport_id}/full_schedule/" 1522 ): 1523 pass 1524 else: 1525 mkdir( 1526 f"{home_dir}/.ncaa_stats_py/" + 1527 f"basketball_{sport_id}/full_schedule/" 1528 ) 1529 1530 if exists( 1531 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/full_schedule/" 1532 + f"{season}_{formatted_level}_full_schedule.csv" 1533 ): 1534 teams_df = pd.read_csv( 1535 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/full_schedule/" 1536 + f"{season}_{formatted_level}_full_schedule.csv" 1537 ) 1538 file_mod_datetime = datetime.fromtimestamp( 1539 getmtime( 1540 f"{home_dir}/.ncaa_stats_py/" + 1541 f"basketball_{sport_id}/full_schedule/" 1542 + f"{season}_{formatted_level}_full_schedule.csv" 1543 ) 1544 ) 1545 else: 1546 file_mod_datetime = datetime.today() 1547 load_from_cache = False 1548 1549 now = datetime.today() 1550 1551 age = now - file_mod_datetime 1552 1553 if ( 1554 age.days > 1 and 1555 season >= now.year 1556 ): 1557 load_from_cache = False 1558 1559 if load_from_cache is True: 1560 return teams_df 1561 1562 teams_df = load_basketball_teams() 1563 teams_df = teams_df[ 1564 (teams_df["season"] == season) & 1565 (teams_df["ncaa_division"] == ncaa_level) 1566 ] 1567 team_ids_arr = teams_df["team_id"].to_numpy() 1568 1569 for team_id in tqdm(team_ids_arr): 1570 temp_df = get_basketball_team_schedule(team_id=team_id) 1571 schedule_df_arr.append(temp_df) 1572 1573 schedule_df = pd.concat(schedule_df_arr, ignore_index=True) 1574 schedule_df = schedule_df.drop_duplicates(subset="game_id", keep="first") 1575 schedule_df.to_csv( 1576 f"{home_dir}/.ncaa_stats_py/" 1577 + f"basketball_{sport_id}/full_schedule/" 1578 + f"{season}_{formatted_level}_full_schedule.csv", 1579 index=False, 1580 ) 1581 return schedule_df 1582 1583 1584def get_basketball_team_roster(team_id: int) -> pd.DataFrame: 1585 """ 1586 Retrieves a basketball team's roster from a given team ID. 1587 1588 Parameters 1589 ---------- 1590 `team_id` (int, mandatory): 1591 Required argument. 1592 Specifies the team you want a roster from. 1593 This is separate from a school ID, which identifies the institution. 1594 A team ID should be unique to a school, and a season. 1595 1596 Usage 1597 ---------- 1598 ```python 1599 1600 from ncaa_stats_py.basketball import get_basketball_team_roster 1601 1602 ######################################## 1603 # Men's Basketball # 1604 ######################################## 1605 1606 # Get the basketball roster for the 1607 # 2024 Alabama St. MBB team (D1, ID: 560655). 1608 print( 1609 "Get the basketball roster for the " + 1610 "2024 Alabama St. MBB team (D1, ID: 560655)." 1611 ) 1612 df = get_basketball_team_roster(560655) 1613 print(df) 1614 1615 # Get the basketball roster for the 1616 # 2023 Roberts Wesleyan MBB team (D2, ID: 542994). 1617 print( 1618 "Get the basketball roster for the " + 1619 "2023 Roberts Wesleyan MBB team (D2, ID: 542994)." 1620 ) 1621 df = get_basketball_team_roster(542994) 1622 print(df) 1623 1624 # Get the basketball roster for the 1625 # 2022 Pacific Lutheran MBB team (D3, ID: 528255). 1626 print( 1627 "Get the basketball roster for the " + 1628 "2022 Pacific Lutheran MBB team (D3, ID: 528255)." 1629 ) 1630 df = get_basketball_team_roster(528255) 1631 print(df) 1632 1633 ######################################## 1634 # Women's Basketball # 1635 ######################################## 1636 1637 # Get the basketball roster for the 1638 # 2021 Michigan St. WBB team (D1, ID: 506069). 1639 print( 1640 "Get the basketball roster for the " + 1641 "2021 Michigan St. WBB team (D1, ID: 506069)." 1642 ) 1643 df = get_basketball_team_roster(506069) 1644 print(df) 1645 1646 # Get the basketball roster for the 1647 # 2020 Shippensburg WBB team (D2, ID: 484864). 1648 print( 1649 "Get the basketball roster for the " + 1650 "2020 Shippensburg WBB team (D2, ID: 484864)." 1651 ) 1652 df = get_basketball_team_roster(484864) 1653 print(df) 1654 1655 # Get the basketball roster for the 1656 # 2019 Maranatha Baptist team (D3, ID: 452546). 1657 print( 1658 "Get the basketball roster for the " + 1659 "2019 Maranatha Baptist team (D3, ID: 452546)." 1660 ) 1661 df = get_basketball_team_roster(452546) 1662 print(df) 1663 1664 ``` 1665 1666 Returns 1667 ---------- 1668 A pandas `DataFrame` object with 1669 an NCAA basketball team's roster for that season. 1670 """ 1671 sport_id = "" 1672 roster_df = pd.DataFrame() 1673 roster_df_arr = [] 1674 temp_df = pd.DataFrame() 1675 url = f"https://stats.ncaa.org/teams/{team_id}/roster" 1676 load_from_cache = True 1677 home_dir = expanduser("~") 1678 home_dir = _format_folder_str(home_dir) 1679 1680 stat_columns = [ 1681 "season", 1682 "season_name", 1683 "sport_id", 1684 "ncaa_division", 1685 "ncaa_division_formatted", 1686 "team_conference_name", 1687 "school_id", 1688 "school_name", 1689 "player_id", 1690 "player_jersey_num", 1691 "player_full_name", 1692 "player_first_name", 1693 "player_last_name", 1694 "player_class", 1695 "player_positions", 1696 "player_height_string", 1697 "player_weight", 1698 "player_hometown", 1699 "player_high_school", 1700 "player_G", 1701 "player_GS", 1702 "player_url", 1703 ] 1704 1705 try: 1706 team_df = load_basketball_teams() 1707 team_df = team_df[team_df["team_id"] == team_id] 1708 1709 season = team_df["season"].iloc[0] 1710 ncaa_division = team_df["ncaa_division"].iloc[0] 1711 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 1712 team_conference_name = team_df["team_conference_name"].iloc[0] 1713 school_name = team_df["school_name"].iloc[0] 1714 school_id = int(team_df["school_id"].iloc[0]) 1715 sport_id = "MBB" 1716 except Exception: 1717 team_df = load_basketball_teams(get_wbb_data=True) 1718 team_df = team_df[team_df["team_id"] == team_id] 1719 1720 season = team_df["season"].iloc[0] 1721 ncaa_division = team_df["ncaa_division"].iloc[0] 1722 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 1723 team_conference_name = team_df["team_conference_name"].iloc[0] 1724 school_name = team_df["school_name"].iloc[0] 1725 school_id = int(team_df["school_id"].iloc[0]) 1726 school_id = int(team_df["school_id"].iloc[0]) 1727 sport_id = "WBB" 1728 1729 del team_df 1730 1731 if exists(f"{home_dir}/.ncaa_stats_py/"): 1732 pass 1733 else: 1734 mkdir(f"{home_dir}/.ncaa_stats_py/") 1735 1736 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 1737 pass 1738 else: 1739 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 1740 1741 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/"): 1742 pass 1743 else: 1744 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/") 1745 1746 if exists( 1747 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1748 f"{team_id}_roster.csv" 1749 ): 1750 teams_df = pd.read_csv( 1751 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1752 f"{team_id}_roster.csv" 1753 ) 1754 file_mod_datetime = datetime.fromtimestamp( 1755 getmtime( 1756 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1757 f"{team_id}_roster.csv" 1758 ) 1759 ) 1760 else: 1761 file_mod_datetime = datetime.today() 1762 load_from_cache = False 1763 1764 now = datetime.today() 1765 1766 age = now - file_mod_datetime 1767 1768 if ( 1769 age.days >= 14 and 1770 season >= now.year 1771 ): 1772 load_from_cache = False 1773 1774 if load_from_cache is True: 1775 return teams_df 1776 1777 response = _get_webpage(url=url) 1778 soup = BeautifulSoup(response.text, features="lxml") 1779 try: 1780 school_name = soup.find( 1781 "div", 1782 {"class": "card"} 1783 ).find("img").get("alt") 1784 except Exception: 1785 school_name = soup.find("div", {"class": "card"}).find("a").text 1786 school_name = school_name.rsplit(" ", maxsplit=1)[0] 1787 1788 season_name = ( 1789 soup.find("select", {"id": "year_list"}) 1790 .find("option", {"selected": "selected"}) 1791 .text 1792 ) 1793 # For NCAA basketball, the season always starts in the spring semester, 1794 # and ends in the fall semester. 1795 # Thus, if `season_name` = "2011-12", this is the "2012" basketball season, 1796 # because 2012 would encompass the spring and fall semesters 1797 # for NCAA member institutions. 1798 season = f"{season_name[0:2]}{season_name[-2:]}" 1799 season = int(season) 1800 1801 try: 1802 table = soup.find( 1803 "table", 1804 {"class": "dataTable small_font"}, 1805 ) 1806 1807 table_headers = table.find("thead").find_all("th") 1808 except Exception: 1809 table = soup.find( 1810 "table", 1811 {"class": "dataTable small_font no_padding"}, 1812 ) 1813 1814 table_headers = table.find("thead").find_all("th") 1815 table_headers = [x.text for x in table_headers] 1816 1817 t_rows = table.find("tbody").find_all("tr") 1818 1819 for t in t_rows: 1820 t_cells = t.find_all("td") 1821 t_cells = [x.text for x in t_cells] 1822 1823 temp_df = pd.DataFrame( 1824 data=[t_cells], 1825 columns=table_headers, 1826 # index=[0] 1827 ) 1828 1829 player_id = t.find("a").get("href") 1830 # temp_df["school_name"] = school_name 1831 temp_df["player_url"] = f"https://stats.ncaa.org{player_id}" 1832 1833 player_id = player_id.replace("/players", "").replace("/", "") 1834 player_id = int(player_id) 1835 1836 temp_df["player_id"] = player_id 1837 1838 roster_df_arr.append(temp_df) 1839 del temp_df 1840 1841 roster_df = pd.concat(roster_df_arr, ignore_index=True) 1842 roster_df = roster_df.infer_objects() 1843 roster_df["season"] = season 1844 roster_df["season_name"] = season_name 1845 roster_df["ncaa_division"] = ncaa_division 1846 roster_df["ncaa_division_formatted"] = ncaa_division_formatted 1847 roster_df["team_conference_name"] = team_conference_name 1848 roster_df["school_id"] = school_id 1849 roster_df["school_name"] = school_name 1850 roster_df["sport_id"] = sport_id 1851 1852 roster_df.rename( 1853 columns={ 1854 "GP": "player_G", 1855 "GS": "player_GS", 1856 "#": "player_jersey_num", 1857 "Name": "player_full_name", 1858 "Class": "player_class", 1859 "Position": "player_positions", 1860 "Height": "player_height_string", 1861 "Hometown": "player_hometown", 1862 "High School": "player_high_school", 1863 }, 1864 inplace=True 1865 ) 1866 1867 roster_df[["player_first_name", "player_last_name"]] = roster_df[ 1868 "player_full_name" 1869 ].str.split(" ", n=1, expand=True) 1870 roster_df = roster_df.infer_objects() 1871 1872 for i in roster_df.columns: 1873 if i in stat_columns: 1874 pass 1875 else: 1876 raise ValueError( 1877 f"Unhandled column name {i}" 1878 ) 1879 1880 roster_df = roster_df.infer_objects().reindex(columns=stat_columns) 1881 1882 roster_df.to_csv( 1883 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1884 f"{team_id}_roster.csv", 1885 index=False, 1886 ) 1887 return roster_df 1888 1889 1890def get_basketball_player_season_stats( 1891 team_id: int, 1892) -> pd.DataFrame: 1893 """ 1894 Given a team ID, this function retrieves and parses 1895 the season stats for all of the players in a given basketball team. 1896 1897 Parameters 1898 ---------- 1899 `team_id` (int, mandatory): 1900 Required argument. 1901 Specifies the team you want basketball stats from. 1902 This is separate from a school ID, which identifies the institution. 1903 A team ID should be unique to a school, and a season. 1904 1905 Usage 1906 ---------- 1907 ```python 1908 1909 from ncaa_stats_py.basketball import get_basketball_player_season_stats 1910 1911 ######################################## 1912 # Men's Basketball # 1913 ######################################## 1914 1915 # Get the season stats for the 1916 # 2024 Illinois MBB team (D1, ID: 560955). 1917 print( 1918 "Get the season stats for the " + 1919 "2024 Illinois MBB team (D1, ID: 560955)." 1920 ) 1921 df = get_basketball_player_season_stats(560955) 1922 print(df) 1923 1924 # Get the season stats for the 1925 # 2023 Chico St. MBB team (D2, ID: 542605). 1926 print( 1927 "Get the season stats for the " + 1928 "2023 Chico St. MBB team (D2, ID: 542605)." 1929 ) 1930 df = get_basketball_player_season_stats(542605) 1931 print(df) 1932 1933 # Get the season stats for the 1934 # 2022 Maine Maritime MBB team (D3, ID: 528070). 1935 print( 1936 "Get the season stats for the " + 1937 "2022 Maine Maritime MBB team (D3, ID: 528070)." 1938 ) 1939 df = get_basketball_player_season_stats(528070) 1940 print(df) 1941 1942 ######################################## 1943 # Women's Basketball # 1944 ######################################## 1945 1946 # Get the season stats for the 1947 # 2021 Louisville WBB team (D1, ID: 506050). 1948 print( 1949 "Get the season stats for the " + 1950 "2021 Louisville WBB team (D1, ID: 506050)." 1951 ) 1952 df = get_basketball_player_season_stats(506050) 1953 print(df) 1954 1955 # Get the season stats for the 1956 # 2020 Paine WBB team (D2, ID: 484830). 1957 print( 1958 "Get the season stats for the " + 1959 "2020 Paine WBB team (D2, ID: 484830)." 1960 ) 1961 df = get_basketball_player_season_stats(484830) 1962 print(df) 1963 1964 # Get the season stats for the 1965 # 2019 Pomona-Pitzer team (D3, ID: 452413). 1966 print( 1967 "Get the season stats for the " + 1968 "2019 Pomona-Pitzer team (D3, ID: 452413)." 1969 ) 1970 df = get_basketball_player_season_stats(452413) 1971 print(df) 1972 1973 ``` 1974 1975 Returns 1976 ---------- 1977 A pandas `DataFrame` object with the season batting stats for 1978 all players with a given NCAA basketball team. 1979 """ 1980 1981 sport_id = "" 1982 load_from_cache = True 1983 stats_df = pd.DataFrame() 1984 stats_df_arr = [] 1985 temp_df = pd.DataFrame() 1986 1987 stat_columns = [ 1988 "season", 1989 "season_name", 1990 "sport_id", 1991 "team_id", 1992 "team_conference_name", 1993 "school_id", 1994 "school_name", 1995 "ncaa_division", 1996 "ncaa_division_formatted", 1997 "player_id", 1998 "player_jersey_number", 1999 "player_last_name", 2000 "player_first_name", 2001 "player_full_name", 2002 "player_class", 2003 "player_position", 2004 "player_height", 2005 "GP", 2006 "GS", 2007 "MP_str", 2008 "MP_minutes", 2009 "MP_seconds", 2010 "MP_total_seconds", 2011 "FGM", 2012 "FGA", 2013 "FG%", 2014 "eFG%", 2015 "TSA", 2016 "TS%", 2017 "2PM", 2018 "2PA", 2019 "2FG%", 2020 "3PM", 2021 "3PA", 2022 "3FG%", 2023 "FT", 2024 "FTA", 2025 "FT%", 2026 "PTS", 2027 "ORB", 2028 "DRB", 2029 "TRB", 2030 "Avg", 2031 "AST", 2032 "TOV", 2033 "TOV%", 2034 "STL", 2035 "BLK", 2036 "PF", 2037 "DBL_DBL", 2038 "TRP_DBL", 2039 "DQ", 2040 "TF", 2041 ] 2042 2043 # if get_wbb_data is True: 2044 # sport_id = "WBB" 2045 # else: 2046 # sport_id = "MBB" 2047 2048 try: 2049 team_df = load_basketball_teams() 2050 2051 team_df = team_df[team_df["team_id"] == team_id] 2052 2053 season = team_df["season"].iloc[0] 2054 ncaa_division = int(team_df["ncaa_division"].iloc[0]) 2055 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 2056 team_conference_name = team_df["team_conference_name"].iloc[0] 2057 school_name = team_df["school_name"].iloc[0] 2058 school_id = int(team_df["school_id"].iloc[0]) 2059 sport_id = "MBB" 2060 except Exception: 2061 team_df = load_basketball_teams(get_wbb_data=True) 2062 2063 team_df = team_df[team_df["team_id"] == team_id] 2064 2065 season = team_df["season"].iloc[0] 2066 ncaa_division = int(team_df["ncaa_division"].iloc[0]) 2067 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 2068 team_conference_name = team_df["team_conference_name"].iloc[0] 2069 school_name = team_df["school_name"].iloc[0] 2070 school_id = int(team_df["school_id"].iloc[0]) 2071 sport_id = "WBB" 2072 2073 del team_df 2074 2075 # stat_id = _get_stat_id( 2076 # sport="basketball", 2077 # season=season, 2078 # stat_type="batting" 2079 # ) 2080 2081 home_dir = expanduser("~") 2082 home_dir = _format_folder_str(home_dir) 2083 2084 url = f"https://stats.ncaa.org/teams/{team_id}/season_to_date_stats" 2085 2086 if exists(f"{home_dir}/.ncaa_stats_py/"): 2087 pass 2088 else: 2089 mkdir(f"{home_dir}/.ncaa_stats_py/") 2090 2091 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 2092 pass 2093 else: 2094 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 2095 2096 if exists( 2097 f"{home_dir}/.ncaa_stats_py/" + 2098 f"basketball_{sport_id}/player_season_stats/" 2099 ): 2100 pass 2101 else: 2102 mkdir( 2103 f"{home_dir}/.ncaa_stats_py/" + 2104 f"basketball_{sport_id}/player_season_stats/" 2105 ) 2106 2107 if exists( 2108 f"{home_dir}/.ncaa_stats_py/" + 2109 f"basketball_{sport_id}/player_season_stats/" 2110 + f"{season:00d}_{school_id:00d}_player_season_stats.csv" 2111 ): 2112 games_df = pd.read_csv( 2113 f"{home_dir}/.ncaa_stats_py/" + 2114 f"basketball_{sport_id}/player_season_stats/" 2115 + f"{season:00d}_{school_id:00d}_player_season_stats.csv" 2116 ) 2117 file_mod_datetime = datetime.fromtimestamp( 2118 getmtime( 2119 f"{home_dir}/.ncaa_stats_py/" + 2120 f"basketball_{sport_id}/player_season_stats/" 2121 + f"{season:00d}_{school_id:00d}_player_season_stats.csv" 2122 ) 2123 ) 2124 else: 2125 file_mod_datetime = datetime.today() 2126 load_from_cache = False 2127 2128 now = datetime.today() 2129 2130 age = now - file_mod_datetime 2131 2132 if ( 2133 age.days > 1 and 2134 season >= now.year 2135 ): 2136 load_from_cache = False 2137 2138 if load_from_cache is True: 2139 return games_df 2140 2141 response = _get_webpage(url=url) 2142 soup = BeautifulSoup(response.text, features="lxml") 2143 # try: 2144 # school_name = soup.find( 2145 # "div", {"class": "card"} 2146 # ).find("img").get("alt") 2147 # except Exception: 2148 # school_name = soup.find("div", {"class": "card"}).find("a").text 2149 # school_name = school_name.rsplit(" ", maxsplit=1)[0] 2150 2151 season_name = ( 2152 soup.find("select", {"id": "year_list"}) 2153 .find("option", {"selected": "selected"}) 2154 .text 2155 ) 2156 # For NCAA basketball, the season always starts in the fall semester, 2157 # and ends in the spring semester. 2158 # Thus, if `season_name` = "2011-12", this is the "2012" basketball season, 2159 # because 2012 would encompass the fall and spring semesters 2160 # for NCAA member institutions. 2161 season = f"{season_name[0:2]}{season_name[-2:]}" 2162 season = int(season) 2163 2164 # stat_categories_arr = soup.find( 2165 # "ul", {"class": "nav nav-tabs padding-nav"} 2166 # ).find_all("a") 2167 2168 table_data = soup.find( 2169 "table", 2170 {"id": "stat_grid", "class": "small_font dataTable table-bordered"}, 2171 ) 2172 2173 temp_table_headers = table_data.find("thead").find("tr").find_all("th") 2174 table_headers = [x.text for x in temp_table_headers] 2175 2176 del temp_table_headers 2177 2178 t_rows = table_data.find("tbody").find_all("tr", {"class": "text"}) 2179 for t in t_rows: 2180 p_last = "" 2181 p_first = "" 2182 t_cells = t.find_all("td") 2183 if "team" in t_cells[1].text.lower(): 2184 continue 2185 p_sortable = t_cells[1].get("data-order") 2186 if len(p_sortable) == 2: 2187 p_last, p_first = p_sortable.split(",") 2188 elif len(p_sortable) == 3: 2189 p_last, temp_name, p_first = p_sortable.split(",") 2190 p_last = f"{p_last} {temp_name}" 2191 2192 t_cells = [x.text.strip() for x in t_cells] 2193 2194 temp_df = pd.DataFrame( 2195 data=[t_cells], 2196 columns=table_headers, 2197 # index=[0] 2198 ) 2199 2200 player_id = t.find("a").get("href") 2201 2202 # temp_df["player_url"] = f"https://stats.ncaa.org{player_id}" 2203 player_id = player_id.replace("/players", "").replace("/", "") 2204 2205 # stat_id = -1 2206 # if "year_stat_category_id" in player_id: 2207 # stat_id = player_id 2208 # stat_id = stat_id.rsplit("?")[-1] 2209 # stat_id = stat_id.replace("?", "").replace( 2210 # "year_stat_category_id=", "" 2211 # ) 2212 # stat_id = int(stat_id) 2213 2214 # player_id = player_id.split("?")[0] 2215 2216 player_id = int(player_id) 2217 2218 temp_df["player_id"] = player_id 2219 temp_df["player_last_name"] = p_last.strip() 2220 temp_df["player_first_name"] = p_first.strip() 2221 2222 stats_df_arr.append(temp_df) 2223 del temp_df 2224 2225 stats_df = pd.concat(stats_df_arr, ignore_index=True) 2226 stats_df = stats_df.replace("", None) 2227 2228 # stats_df["stat_id"] = stat_id 2229 stats_df["season"] = season 2230 stats_df["season_name"] = season_name 2231 stats_df["school_id"] = school_id 2232 stats_df["school_name"] = school_name 2233 stats_df["ncaa_division"] = ncaa_division 2234 stats_df["ncaa_division_formatted"] = ncaa_division_formatted 2235 stats_df["team_conference_name"] = team_conference_name 2236 stats_df["sport_id"] = sport_id 2237 stats_df["team_id"] = team_id 2238 2239 stats_df = stats_df.infer_objects() 2240 2241 stats_df.rename( 2242 columns={ 2243 "#": "player_jersey_number", 2244 "Player": "player_full_name", 2245 "Yr": "player_class", 2246 "Pos": "player_position", 2247 "Ht": "player_height", 2248 "B/T": "player_bats_throws", 2249 "3FG": "3PM", 2250 "3FGA": "3PA", 2251 "ORebs": "ORB", 2252 "DRebs": "DRB", 2253 "Tot Reb": "TRB", 2254 "TO": "TOV", 2255 "Dbl Dbl": "DBL_DBL", 2256 "Trpl Dbl": "TRP_DBL", 2257 "Fouls": "PF", 2258 'Tech Fouls': "TF", 2259 'Effective FG Pct.': "eFG%", 2260 "MP": "MP_str", 2261 "Min": "MP_str", 2262 "Off Reb": "ORB", 2263 "Def Reb": "DRB", 2264 "ST": "STL", 2265 "BLKS": "BLK" 2266 }, 2267 inplace=True, 2268 ) 2269 stats_df = stats_df.infer_objects().fillna(0) 2270 stats_df = stats_df.astype( 2271 { 2272 "GP": "uint16", 2273 "GS": "uint16", 2274 "FGM": "uint16", 2275 "FGA": "uint16", 2276 "3PM": "uint16", 2277 "3PA": "uint16", 2278 "FT": "uint16", 2279 "FTA": "uint16", 2280 "PTS": "uint16", 2281 "ORB": "uint16", 2282 "DRB": "uint16", 2283 "TRB": "uint16", 2284 "AST": "uint16", 2285 "TOV": "uint16", 2286 "STL": "uint16", 2287 "BLK": "uint16", 2288 "PF": "uint16", 2289 "DBL_DBL": "uint16", 2290 "TRP_DBL": "uint16", 2291 "school_id": "uint32", 2292 } 2293 ) 2294 2295 # This is a separate function call because these stats 2296 # *don't* exist in every season. 2297 if "DQ" not in stats_df.columns: 2298 stats_df["DQ"] = None 2299 2300 if "TF" not in stats_df.columns: 2301 stats_df["TF"] = None 2302 2303 stats_df = stats_df.astype( 2304 { 2305 "DQ": "uint16", 2306 "TF": "uint16", 2307 }, 2308 errors="ignore" 2309 ) 2310 2311 stats_df[["MP_minutes", "MP_seconds"]] = stats_df["MP_str"].str.split( 2312 ":", expand=True 2313 ) 2314 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 2315 "MP_minutes", "MP_seconds" 2316 ]].astype("uint64") 2317 stats_df["MP_total_seconds"] = ( 2318 stats_df["MP_seconds"] + (stats_df["MP_minutes"] * 60) 2319 ) 2320 2321 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 2322 stats_df["FG%"] = stats_df["FG%"].round(4) 2323 2324 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 2325 stats_df["3P%"] = stats_df["3P%"].round(4) 2326 2327 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 2328 stats_df["FT%"] = stats_df["FT%"].round(4) 2329 2330 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 2331 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 2332 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 2333 stats_df["2P%"] = stats_df["2P%"].round(4) 2334 2335 stats_df["eFG%"] = ( 2336 ( 2337 stats_df["FGM"] + 2338 (stats_df["3PM"] * 0.5) 2339 ) / 2340 stats_df["FGA"] 2341 ) 2342 stats_df["eFG%"] = stats_df["eFG%"].round(4) 2343 2344 stats_df["TSA"] = ( 2345 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 2346 ) 2347 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 2348 stats_df["TS%"] = stats_df["TS%"].round(4) 2349 2350 stats_df["TOV%"] = ( 2351 stats_df["TOV"] / 2352 ( 2353 stats_df["FGA"] + 2354 (stats_df["FTA"] * 0.44) + 2355 stats_df["TOV"] 2356 ) 2357 ) 2358 stats_df["TOV%"] = stats_df["TOV%"].round(4) 2359 # In many seasons, there is an ["Avg"] column 2360 # that would otherwise completely screw up 2361 # any attempts to use the final DataFrame, 2362 # because it would be a duplicate column 2363 # that pandas wouldn't complain about 2364 # until it's too late. 2365 2366 duplicate_cols = stats_df.columns[stats_df.columns.duplicated()] 2367 stats_df.drop(columns=duplicate_cols, inplace=True) 2368 # stats_df = stats_df.T.drop_duplicates().T 2369 stats_df = stats_df.reindex(columns=stat_columns) 2370 # print(stats_df.columns) 2371 stats_df.to_csv( 2372 f"{home_dir}/.ncaa_stats_py/" + 2373 f"basketball_{sport_id}/player_season_stats/" + 2374 f"{season:00d}_{school_id:00d}_player_season_stats.csv", 2375 index=False, 2376 ) 2377 2378 return stats_df 2379 2380 2381def get_basketball_player_game_stats( 2382 player_id: int, 2383 season: int 2384) -> pd.DataFrame: 2385 """ 2386 Given a valid player ID and season, 2387 this function retrieves the game stats for this player at a game level. 2388 2389 Parameters 2390 ---------- 2391 `player_id` (int, mandatory): 2392 Required argument. 2393 Specifies the player you want game stats from. 2394 2395 `season` (int, mandatory): 2396 Required argument. 2397 Specifies the season you want game stats from. 2398 2399 Usage 2400 ---------- 2401 ```python 2402 2403 from ncaa_stats_py.basketball import ( 2404 get_basketball_player_game_stats 2405 ) 2406 2407 # Get the batting stats of Jacob Berry in 2022 (LSU). 2408 print( 2409 "Get the batting stats of Jacob Berry in 2022 (LSU)." 2410 ) 2411 df = get_basketball_player_game_stats(player_id=7579336, season=2022) 2412 print(df) 2413 2414 # Get the batting stats of Alec Burleson in 2019 (ECU). 2415 print( 2416 "Get the batting stats of Alec Burleson in 2019 (ECU)." 2417 ) 2418 df = get_basketball_player_game_stats(player_id=6015715, season=2019) 2419 print(df) 2420 2421 # Get the batting stats of Hunter Bishop in 2018 (Arizona St.). 2422 print( 2423 "Get the batting stats of Hunter Bishop in 2018 (Arizona St.)." 2424 ) 2425 df = get_basketball_player_game_stats(player_id=6014052, season=2019) 2426 print(df) 2427 2428 ``` 2429 2430 Returns 2431 ---------- 2432 A pandas `DataFrame` object with a player's batting game logs 2433 in a given season. 2434 """ 2435 sport_id = "" 2436 2437 stat_columns = [ 2438 "season", 2439 "game_id", 2440 "game_num", 2441 "player_id", 2442 "date", 2443 "opponent", 2444 "Result", 2445 "team_score", 2446 "opponent_score", 2447 "MP_str", 2448 "MP_minutes", 2449 "MP_seconds", 2450 "MP_total_seconds", 2451 "GP", 2452 "GS", 2453 "FGM", 2454 "FGA", 2455 "FG%", 2456 "eFG%", 2457 "2PM", 2458 "2PA", 2459 "2P%", 2460 "3PM", 2461 "3PA", 2462 "3P%", 2463 "FT", 2464 "FTA", 2465 "FT%", 2466 "ORB", 2467 "DRB", 2468 "TRB", 2469 "AST", 2470 "TOV", 2471 "TOV%", 2472 "STL", 2473 "BLK", 2474 "PF", 2475 "DQ", 2476 "TF", 2477 "TSA", 2478 "TS%", 2479 "PTS", 2480 "DBL_DBL", 2481 "TRP_DBL", 2482 ] 2483 load_from_cache = True 2484 stats_df = pd.DataFrame() 2485 stats_df_arr = [] 2486 temp_df = pd.DataFrame() 2487 home_dir = expanduser("~") 2488 home_dir = _format_folder_str(home_dir) 2489 2490 # stat_id = _get_stat_id( 2491 # sport="basketball", 2492 # season=season, 2493 # stat_type="batting" 2494 # ) 2495 url = f"https://stats.ncaa.org/players/{player_id}" 2496 2497 if exists(f"{home_dir}/.ncaa_stats_py/"): 2498 pass 2499 else: 2500 mkdir(f"{home_dir}/.ncaa_stats_py/") 2501 2502 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/"): 2503 pass 2504 else: 2505 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/") 2506 2507 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/"): 2508 pass 2509 else: 2510 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/") 2511 2512 if exists( 2513 f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/" 2514 + f"{season}_{player_id}_player_game_stats.csv" 2515 ): 2516 games_df = pd.read_csv( 2517 f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/" 2518 + f"{season}_{player_id}_player_game_stats.csv" 2519 ) 2520 file_mod_datetime = datetime.fromtimestamp( 2521 getmtime( 2522 f"{home_dir}/.ncaa_stats_py/basketball_MBB/" 2523 + "player_game_stats/" 2524 + f"{season}_{player_id}_player_game_stats.csv" 2525 ) 2526 ) 2527 games_df = games_df.infer_objects() 2528 load_from_cache = True 2529 else: 2530 file_mod_datetime = datetime.today() 2531 load_from_cache = False 2532 2533 if exists(f"{home_dir}/.ncaa_stats_py/"): 2534 pass 2535 else: 2536 mkdir(f"{home_dir}/.ncaa_stats_py/") 2537 2538 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/"): 2539 pass 2540 else: 2541 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/") 2542 2543 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/"): 2544 pass 2545 else: 2546 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/") 2547 2548 if exists( 2549 f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/" 2550 + f"{season}_{player_id}_player_game_stats.csv" 2551 ): 2552 games_df = pd.read_csv( 2553 f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/" 2554 + f"{season}_{player_id}_player_game_stats.csv" 2555 ) 2556 file_mod_datetime = datetime.fromtimestamp( 2557 getmtime( 2558 f"{home_dir}/.ncaa_stats_py/basketball_WBB/" 2559 + "player_game_stats/" 2560 + f"{season}_{player_id}_player_game_stats.csv" 2561 ) 2562 ) 2563 games_df = games_df.infer_objects() 2564 load_from_cache = True 2565 else: 2566 logging.info("Could not find a WBB player game stats file") 2567 2568 now = datetime.today() 2569 2570 age = now - file_mod_datetime 2571 2572 if ( 2573 age.days > 1 and 2574 (season - 1) >= now.year 2575 ): 2576 load_from_cache = False 2577 2578 if load_from_cache is True: 2579 return games_df 2580 2581 # team_df = load_basketball_teams() 2582 2583 # team_df = team_df[team_df["team_id"] == team_id] 2584 2585 # season = team_df["season"].iloc[0] 2586 # ncaa_division = team_df["ncaa_division"].iloc[0] 2587 # ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 2588 # team_conference_name = team_df["team_conference_name"].iloc[0] 2589 # school_name = team_df["school_name"].iloc[0] 2590 # school_id = int(team_df["school_id"].iloc[0]) 2591 2592 # del team_df 2593 response = _get_webpage(url=url) 2594 soup = BeautifulSoup(response.text, features="lxml") 2595 2596 table_navigation = soup.find("ul", {"class": "nav nav-tabs padding-nav"}) 2597 table_nav_card = table_navigation.find_all("a") 2598 2599 for u in table_nav_card: 2600 url_str = u.get("href") 2601 if "MBB" in url_str.upper(): 2602 sport_id = "MBB" 2603 elif "WBB" in url_str.upper(): 2604 sport_id = "WBB" 2605 2606 if sport_id is None or len(sport_id) == 0: 2607 # This should **never** be the case IRL, 2608 # but in case something weird happened and 2609 # we can't make a determination of if this is a 2610 # MBB player or a WBB player, and we somehow haven't 2611 # crashed by this point, set the sport ID to 2612 # "MBB" by default so we don't have other weirdness. 2613 logging.error( 2614 f"Could not determine if player ID {player_id} " + 2615 "is a MBB or a WBB player. " + 2616 "Because this cannot be determined, " + 2617 "we will make the automatic assumption that this is a MBB player." 2618 ) 2619 sport_id = "MBB" 2620 2621 table_data = soup.find_all( 2622 "table", {"class": "small_font dataTable table-bordered"} 2623 )[1] 2624 2625 temp_table_headers = table_data.find("thead").find("tr").find_all("th") 2626 table_headers = [x.text for x in temp_table_headers] 2627 2628 del temp_table_headers 2629 2630 temp_t_rows = table_data.find("tbody") 2631 temp_t_rows = temp_t_rows.find_all("tr") 2632 2633 for t in temp_t_rows: 2634 game_num = 1 2635 ot_periods = 0 2636 # innings = 9 2637 row_id = t.get("id") 2638 opp_team_name = "" 2639 2640 if "contest" not in row_id: 2641 continue 2642 del row_id 2643 2644 t_cells = t.find_all("td") 2645 t_cells = [x.text.strip() for x in t_cells] 2646 2647 g_date = t_cells[0] 2648 2649 if "(" in g_date: 2650 g_date, game_num = g_date.split("(") 2651 g_date = g_date.strip() 2652 2653 game_num = game_num.replace(")", "") 2654 game_num = int(game_num) 2655 2656 try: 2657 opp_team_id = t.find_all("td")[1].find("a").get("href") 2658 except AttributeError as e: 2659 logging.info( 2660 "Could not extract a team ID for this game. " + 2661 f"Full exception {e}" 2662 ) 2663 except Exception as e: 2664 logging.warning( 2665 "An unhandled exception has occurred when " 2666 + "trying to get the opposition team ID for this game. " 2667 f"Full exception `{e}`." 2668 ) 2669 raise e 2670 2671 try: 2672 opp_team_id = opp_team_id.replace("/teams/", "") 2673 opp_team_id = opp_team_id.replace( 2674 "javascript:toggleDefensiveStats(", "" 2675 ) 2676 opp_team_id = opp_team_id.replace(");", "") 2677 opp_team_id = int(opp_team_id) 2678 2679 temp_df["opponent_team_id"] = opp_team_id 2680 except Exception: 2681 logging.info( 2682 "Couldn't find the opposition team naIDme " 2683 + "for this row. " 2684 ) 2685 opp_team_id = None 2686 # print(i.find("td").text) 2687 try: 2688 opp_team_name = t.find_all("td")[1].find_all("img")[1].get("alt") 2689 except AttributeError: 2690 logging.info( 2691 "Couldn't find the opposition team name " 2692 + "for this row from an image element. " 2693 + "Attempting a backup method" 2694 ) 2695 opp_team_name = t_cells[1] 2696 except IndexError: 2697 logging.info( 2698 "Couldn't find the opposition team name " 2699 + "for this row from an image element. " 2700 + "Attempting a backup method" 2701 ) 2702 opp_team_name = t_cells[1] 2703 except Exception as e: 2704 logging.warning( 2705 "Unhandled exception when trying to get the " 2706 + "opposition team name from this game. " 2707 + f"Full exception `{e}`" 2708 ) 2709 raise e 2710 2711 if opp_team_name == "Defensive Stats": 2712 opp_team_name = t_cells[1] 2713 2714 if "@" in opp_team_name: 2715 opp_team_name = opp_team_name.split("@")[0] 2716 2717 result_str = t_cells[2] 2718 2719 result_str = ( 2720 result_str.lower().replace("w", "").replace("l", "").replace( 2721 "t", "" 2722 ) 2723 ) 2724 2725 if ( 2726 result_str.lower() == "ppd" or 2727 result_str.lower() == "" or 2728 result_str.lower() == "canceed" 2729 ): 2730 continue 2731 2732 result_str = result_str.replace("\n", "") 2733 result_str = result_str.replace("*", "") 2734 2735 tm_score, opp_score = result_str.split("-") 2736 t_cells = [x.replace("*", "") for x in t_cells] 2737 t_cells = [x.replace("/", "") for x in t_cells] 2738 t_cells = [x.replace("\\", "") for x in t_cells] 2739 2740 temp_df = pd.DataFrame( 2741 data=[t_cells], 2742 columns=table_headers, 2743 # index=[0] 2744 ) 2745 2746 tm_score = int(tm_score) 2747 if "(" in opp_score: 2748 opp_score = opp_score.replace(")", "") 2749 opp_score, ot_periods = opp_score.split("(") 2750 temp_df["ot_periods"] = ot_periods 2751 2752 if "\n" in opp_score: 2753 opp_score = opp_score.strip() 2754 # opp_score = opp_score 2755 opp_score = int(opp_score) 2756 2757 temp_df["team_score"] = tm_score 2758 temp_df["opponent_score"] = opp_score 2759 2760 del tm_score 2761 del opp_score 2762 2763 g_id = t.find_all("td")[2].find("a").get("href") 2764 2765 g_id = g_id.replace("/contests", "") 2766 g_id = g_id.replace("/box_score", "") 2767 g_id = g_id.replace("/", "") 2768 2769 g_id = int(g_id) 2770 temp_df["game_id"] = g_id 2771 2772 del g_id 2773 temp_df.rename( 2774 columns={"Opponent": "opponent", "Date": "date"}, 2775 inplace=True, 2776 ) 2777 game_date = datetime.strptime(g_date, "%m/%d/%Y").date() 2778 2779 temp_df["date"] = game_date 2780 temp_df["game_num"] = game_num 2781 # temp_df["game_innings"] = innings 2782 2783 if len(opp_team_name) > 0: 2784 temp_df["opponent"] = opp_team_name 2785 del opp_team_name 2786 2787 duplicate_cols = temp_df.columns[temp_df.columns.duplicated()] 2788 temp_df.drop(columns=duplicate_cols, inplace=True) 2789 2790 stats_df_arr.append(temp_df) 2791 del temp_df 2792 2793 stats_df = pd.concat(stats_df_arr, ignore_index=True) 2794 stats_df = stats_df.replace("/", "", regex=True) 2795 stats_df = stats_df.replace("", np.nan) 2796 stats_df = stats_df.infer_objects() 2797 2798 stats_df["player_id"] = player_id 2799 stats_df["season"] = season 2800 # In many seasons, there is an ["Avg"] column 2801 # that would otherwise completely screw up 2802 # any attempts to use the final DataFrame, 2803 # because it would be a duplicate column 2804 # that pandas wouldn't complain about 2805 # until it's too late. 2806 2807 duplicate_cols = stats_df.columns[stats_df.columns.duplicated()] 2808 stats_df.drop(columns=duplicate_cols, inplace=True) 2809 2810 stats_df.rename( 2811 columns={ 2812 "#": "player_jersey_number", 2813 "Player": "player_full_name", 2814 "Yr": "player_class", 2815 "Pos": "player_position", 2816 "Ht": "player_height", 2817 "B/T": "player_bats_throws", 2818 "3FG": "3PM", 2819 "3FGA": "3PA", 2820 "ORebs": "ORB", 2821 "DRebs": "DRB", 2822 "Tot Reb": "TRB", 2823 "TO": "TOV", 2824 "Dbl Dbl": "DBL_DBL", 2825 "Trpl Dbl": "TRP_DBL", 2826 "Fouls": "PF", 2827 'Tech Fouls': "TF", 2828 'Effective FG Pct.': "eFG%", 2829 "MP": "MP_str", 2830 "Min": "MP_str", 2831 "Off Reb": "ORB", 2832 "Def Reb": "DRB", 2833 "ST": "STL", 2834 "3FG%": "3P%", 2835 "BLKS": "BLK" 2836 }, 2837 inplace=True, 2838 ) 2839 2840 # This is a separate function call because these stats 2841 # *don't* exist in every season. 2842 if "GS" not in stats_df.columns: 2843 stats_df["GS"] = None 2844 2845 if "DQ" not in stats_df.columns: 2846 stats_df["DQ"] = None 2847 2848 if "TF" not in stats_df.columns: 2849 stats_df["TF"] = None 2850 2851 if "DBL_DBL" not in stats_df.columns: 2852 stats_df["DBL_DBL"] = None 2853 2854 if "TRP_DBL" not in stats_df.columns: 2855 stats_df["TRP_DBL"] = None 2856 2857 stats_df = stats_df.astype( 2858 { 2859 "DQ": "uint16", 2860 "TF": "uint16", 2861 }, 2862 errors="ignore" 2863 ) 2864 2865 stats_df = stats_df.infer_objects().fillna(0) 2866 stats_df = stats_df.astype( 2867 { 2868 "GP": "uint16", 2869 "GS": "uint16", 2870 "FGM": "uint16", 2871 "FGA": "uint16", 2872 "3PM": "uint16", 2873 "3PA": "uint16", 2874 "FT": "uint16", 2875 "FTA": "uint16", 2876 "PTS": "uint16", 2877 "ORB": "uint16", 2878 "DRB": "uint16", 2879 "TRB": "uint16", 2880 "AST": "uint16", 2881 "TOV": "uint16", 2882 "STL": "uint16", 2883 "BLK": "uint16", 2884 "PF": "uint16", 2885 "DBL_DBL": "uint16", 2886 "TRP_DBL": "uint16", 2887 # "school_id": "uint32", 2888 } 2889 ) 2890 2891 stats_df[["MP_minutes", "MP_seconds"]] = stats_df["MP_str"].str.split( 2892 ":", expand=True 2893 ) 2894 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 2895 "MP_minutes", "MP_seconds" 2896 ]].fillna(0) 2897 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 2898 "MP_minutes", "MP_seconds" 2899 ]].astype("uint16") 2900 stats_df["MP_total_seconds"] = ( 2901 stats_df["MP_seconds"] + (stats_df["MP_minutes"] * 60) 2902 ) 2903 2904 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 2905 stats_df["FG%"] = stats_df["FG%"].round(4) 2906 2907 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 2908 stats_df["3P%"] = stats_df["3P%"].round(4) 2909 2910 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 2911 stats_df["FT%"] = stats_df["FT%"].round(4) 2912 2913 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 2914 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 2915 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 2916 stats_df["2P%"] = stats_df["2P%"].round(4) 2917 2918 stats_df["eFG%"] = ( 2919 ( 2920 stats_df["FGM"] + 2921 (stats_df["3PM"] * 0.5) 2922 ) / 2923 stats_df["FGA"] 2924 ) 2925 stats_df["eFG%"] = stats_df["eFG%"].round(4) 2926 2927 stats_df["TSA"] = ( 2928 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 2929 ) 2930 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 2931 stats_df["TS%"] = stats_df["TS%"].round(4) 2932 2933 stats_df["TOV%"] = ( 2934 stats_df["TOV"] / 2935 ( 2936 stats_df["FGA"] + 2937 (stats_df["FTA"] * 0.44) + 2938 stats_df["TOV"] 2939 ) 2940 ) 2941 stats_df["TOV%"] = stats_df["TOV%"].round(4) 2942 stats_df = stats_df.reindex( 2943 columns=stat_columns 2944 ) 2945 # print(stats_df.columns) 2946 stats_df.to_csv( 2947 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/" 2948 + "player_game_stats/" 2949 + f"{season}_{player_id}_player_game_stats.csv", 2950 index=False, 2951 ) 2952 return stats_df 2953 2954 2955def get_basketball_game_player_stats(game_id: int) -> pd.DataFrame: 2956 """ 2957 Given a valid game ID, 2958 this function will attempt to get all player game stats, if possible. 2959 2960 NOTE: Due to an issue with [stats.ncaa.org](stats.ncaa.org), 2961 full player game stats may not be loaded in through this function. 2962 2963 This is a known issue, however you should be able to get position 2964 data and starters information through this function 2965 2966 Parameters 2967 ---------- 2968 `game_id` (int, mandatory): 2969 Required argument. 2970 Specifies the game you want player game stats from. 2971 2972 Usage 2973 ---------- 2974 ```python 2975 2976 from ncaa_stats_py.basketball import get_basketball_game_player_stats 2977 2978 ######################################## 2979 # Men's Basketball # 2980 ######################################## 2981 2982 # Get the game stats of the 2983 # 2024 NCAA D1 Men's Basketball National Championship game. 2984 print( 2985 "Get the game stats of the " 2986 + "2024 NCAA D1 Men's Basketball National Championship game." 2987 ) 2988 df = get_basketball_game_player_stats(5254137) 2989 print(df) 2990 2991 # Get the game stats of a March Madness game on March 29th, 2024 2992 # between Duke and the Houston Cougars. 2993 print( 2994 "Get the game stats of a March Madness game on March 29th, 2024 " 2995 + "between Duke and the Houston Cougars." 2996 ) 2997 df = get_basketball_game_player_stats(5254126) 2998 print(df) 2999 3000 # Get the game stats of a St. Patrick's Day 3001 # game between the Duquesne Dukes and VCU Rams (D1). 3002 print( 3003 "Get the game stats of a St. Patrick's Day " 3004 + "game between the Duquesne Dukes and VCU Rams (D1)." 3005 ) 3006 df = get_basketball_game_player_stats(5252318) 3007 print(df) 3008 3009 # Get the game stats of a December 17th, 2023 3010 # game between the Barry Buccaneers and Findlay Oilers (D2). 3011 print( 3012 "Get the game stats of a December 17th, 2023 " 3013 + "game between the Barry Buccaneers and Findlay Oilers (D2)." 3014 ) 3015 df = get_basketball_game_player_stats(3960610) 3016 print(df) 3017 3018 # Get the game stats of a Valentine's Day 3019 # game between the Kalamazoo Hornets and the Trine Thunder (D2). 3020 print( 3021 "Get the game stats of a Valentine's Day " 3022 + "game between the Kalamazoo Hornets and the Trine Thunder (D2)." 3023 ) 3024 df = get_basketball_game_player_stats(3967963) 3025 print(df) 3026 3027 3028 ######################################## 3029 # Women's Basketball # 3030 ######################################## 3031 3032 # Get the game stats of the 3033 # 2024 NCAA D1 Women's Basketball National Championship game. 3034 print( 3035 "Get the game stats of the " 3036 + "2024 NCAA D1 Women's Basketball National Championship game" 3037 ) 3038 df = get_basketball_game_player_stats(5254137) 3039 print(df) 3040 3041 # Get the game stats of a March 3rd, 2024 3042 # game between Duke and the North Carolina Tar Heels. 3043 print( 3044 "Get the game stats of a March 3rd, 2024 " 3045 + "game between Duke and the North Carolina Tar Heels" 3046 ) 3047 df = get_basketball_game_player_stats(3984600) 3048 print(df) 3049 3050 # Get the game stats of a Thanksgiving Day 3051 # game between the Sacred Heart Pioneers and the P.R.-Mayaguez Janes (D2). 3052 print( 3053 "Get the game stats of a Thanksgiving Day " 3054 + "game between the Sacred Heart Pioneers and " 3055 + "the P.R.-Mayaguez Janes (D2)." 3056 ) 3057 df = get_basketball_game_player_stats(3972687) 3058 print(df) 3059 3060 # Get the game stats of a January 21st, 2024 3061 # game between the Puget Sound Loggers 3062 # and the Whitworth Pirates (D3). 3063 print( 3064 "Get the game stats of a January 21st, 2024 " 3065 + "game between the Puget Sound Loggers and " 3066 + "the Whitworth Pirates (D3)." 3067 ) 3068 df = get_basketball_game_player_stats(3979051) 3069 print(df) 3070 ``` 3071 3072 Returns 3073 ---------- 3074 A pandas `DataFrame` object with player game stats in a given game. 3075 3076 """ 3077 load_from_cache = True 3078 3079 sport_id = "" 3080 season = 0 3081 3082 mbb_teams_df = load_basketball_teams(get_wbb_data=False) 3083 mbb_team_ids_arr = mbb_teams_df["team_id"].to_list() 3084 3085 wbb_teams_df = load_basketball_teams(get_wbb_data=True) 3086 wbb_team_ids_arr = wbb_teams_df["team_id"].to_list() 3087 3088 stats_df = pd.DataFrame() 3089 stats_df_arr = [] 3090 3091 temp_df = pd.DataFrame() 3092 home_dir = expanduser("~") 3093 home_dir = _format_folder_str(home_dir) 3094 3095 stat_columns = [ 3096 "season", 3097 "game_id", 3098 "team_id", 3099 "team_name", 3100 "player_id", 3101 "player_num", 3102 "player_full_name", 3103 "player_position", 3104 "GP", 3105 "GS", 3106 "MP_str", 3107 "MP_minutes", 3108 "MP_seconds", 3109 "MP_total_seconds", 3110 "FGM", 3111 "FGA", 3112 "FG%", 3113 "3PM", 3114 "3PA", 3115 "3P%", 3116 "2PM", 3117 "2PA", 3118 "2P%", 3119 "eFG%", 3120 "FT", 3121 "FTA", 3122 "FT%", 3123 "TSA", 3124 "TS%", 3125 "ORB", 3126 "DRB", 3127 "TRB", 3128 "AST", 3129 "STL", 3130 "BLK", 3131 "TOV", 3132 "TOV%", 3133 "PF", 3134 "TF", 3135 "PTS", 3136 "DQ", 3137 "DBL_DBL", 3138 "TRP_DBL", 3139 ] 3140 3141 url = f"https://stats.ncaa.org/contests/{game_id}/individual_stats" 3142 3143 if exists(f"{home_dir}/.ncaa_stats_py/"): 3144 pass 3145 else: 3146 mkdir(f"{home_dir}/.ncaa_stats_py/") 3147 3148 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/"): 3149 pass 3150 else: 3151 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/") 3152 3153 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/"): 3154 pass 3155 else: 3156 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/") 3157 3158 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/"): 3159 pass 3160 else: 3161 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/") 3162 3163 if exists( 3164 f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/" 3165 + f"{game_id}_player_game_stats.csv" 3166 ): 3167 games_df = pd.read_csv( 3168 f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/" 3169 + f"{game_id}_player_game_stats.csv" 3170 ) 3171 games_df = games_df.infer_objects() 3172 file_mod_datetime = datetime.fromtimestamp( 3173 getmtime( 3174 f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/" 3175 + f"{game_id}_player_game_stats.csv" 3176 ) 3177 ) 3178 load_from_cache = True 3179 else: 3180 file_mod_datetime = datetime.today() 3181 load_from_cache = False 3182 3183 if exists(f"{home_dir}/.ncaa_stats_py/"): 3184 pass 3185 else: 3186 mkdir(f"{home_dir}/.ncaa_stats_py/") 3187 3188 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/"): 3189 pass 3190 else: 3191 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/") 3192 3193 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/"): 3194 pass 3195 else: 3196 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/") 3197 3198 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/"): 3199 pass 3200 else: 3201 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/") 3202 3203 if exists( 3204 f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/" 3205 + f"{game_id}_player_game_stats.csv" 3206 ): 3207 games_df = pd.read_csv( 3208 f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/" 3209 + f"{game_id}_player_game_stats.csv" 3210 ) 3211 games_df = games_df.infer_objects() 3212 file_mod_datetime = datetime.fromtimestamp( 3213 getmtime( 3214 f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/" 3215 + f"{game_id}_player_game_stats.csv" 3216 ) 3217 ) 3218 load_from_cache = True 3219 else: 3220 logging.info("Could not find a WBB player game stats file") 3221 3222 now = datetime.today() 3223 3224 age = now - file_mod_datetime 3225 3226 if age.days >= 35: 3227 load_from_cache = False 3228 3229 if load_from_cache is True: 3230 return games_df 3231 3232 response = _get_webpage(url=url) 3233 soup = BeautifulSoup(response.text, features="lxml") 3234 3235 # table_data = soup.find_all( 3236 # "table", 3237 # {"class": "small_font dataTable table-bordered"} 3238 # )[1] 3239 table_boxes = soup.find_all("div", {"class": "card p-0 table-responsive"}) 3240 3241 for box in table_boxes: 3242 t_header = box.find( 3243 "div", {"class": "card-header"} 3244 ).find( 3245 "div", {"class": "row"} 3246 ) 3247 3248 t_header_str = t_header.text 3249 t_header_str = t_header_str.replace("Period Stats", "") 3250 t_header_str = t_header_str.replace("\n", "") 3251 t_header_str = t_header_str.strip() 3252 3253 team_id = t_header.find("a").get("href") 3254 team_id = team_id.replace("/teams", "") 3255 team_id = team_id.replace("/", "") 3256 team_id = int(team_id) 3257 3258 table_data = box.find( 3259 "table", 3260 {"class": "display dataTable small_font"} 3261 ) 3262 table_headers = box.find("thead").find_all("th") 3263 table_headers = [x.text for x in table_headers] 3264 3265 temp_t_rows = table_data.find("tbody") 3266 temp_t_rows = temp_t_rows.find_all("tr") 3267 3268 spec_stats_df = pd.DataFrame() 3269 spec_stats_df_arr = [] 3270 for t in temp_t_rows: 3271 # row_id = t.get("id") 3272 game_played = 1 3273 game_started = 0 3274 3275 try: 3276 player_id = t.find("a").get("href") 3277 player_id = player_id.replace("/players", "") 3278 player_id = player_id.replace("/player", "") 3279 player_id = player_id.replace("/", "") 3280 except Exception as e: 3281 logging.debug( 3282 "Could not replace player IDs. " + 3283 f"Full exception: `{e}`" 3284 ) 3285 3286 t_cells = t.find_all("td") 3287 p_name = t_cells[1].text.replace("\n", "") 3288 p_name = p_name.strip() 3289 3290 if t_header_str in p_name: 3291 continue 3292 elif p_name.lower() == "team": 3293 continue 3294 if "\xa0" in p_name: 3295 game_started = 0 3296 3297 t_cells = [x.text.strip() for x in t_cells] 3298 player_id = int(player_id) 3299 3300 temp_df = pd.DataFrame( 3301 data=[t_cells], 3302 columns=table_headers 3303 ) 3304 3305 duplicate_cols = temp_df.columns[temp_df.columns.duplicated()] 3306 temp_df.drop(columns=duplicate_cols, inplace=True) 3307 3308 temp_df["player_id"] = player_id 3309 temp_df["GP"] = game_played 3310 temp_df["GS"] = game_started 3311 3312 spec_stats_df_arr.append(temp_df) 3313 del temp_df 3314 3315 spec_stats_df = pd.concat( 3316 spec_stats_df_arr, 3317 ignore_index=True 3318 ) 3319 3320 if team_id in mbb_team_ids_arr: 3321 sport_id = "MBB" 3322 df = mbb_teams_df[mbb_teams_df["team_id"] == team_id] 3323 season = df["season"].iloc[0] 3324 elif team_id in wbb_team_ids_arr: 3325 sport_id = "WBB" 3326 df = wbb_teams_df[wbb_teams_df["team_id"] == team_id] 3327 season = df["season"].iloc[0] 3328 else: 3329 raise ValueError( 3330 f"Unhandled team ID {team_id}" 3331 ) 3332 spec_stats_df["team_id"] = team_id 3333 spec_stats_df["team_name"] = t_header_str 3334 stats_df_arr.append(spec_stats_df) 3335 del spec_stats_df 3336 3337 stats_df = pd.concat(stats_df_arr) 3338 stats_df["season"] = season 3339 stats_df.rename( 3340 columns={ 3341 "#": "player_num", 3342 "Name": "player_full_name", 3343 "P": "player_position", 3344 "MP": "MP_str", 3345 "3FG": "3PM", 3346 "3FGA": "3PA", 3347 "ORebs": "ORB", 3348 "DRebs": "DRB", 3349 "TotReb": "TRB", 3350 "TO": "TOV", 3351 "TechFouls": "TF", 3352 "Fouls": "PF" 3353 }, 3354 inplace=True, 3355 ) 3356 3357 if "GS" not in stats_df.columns: 3358 stats_df["GS"] = None 3359 3360 if "DQ" not in stats_df.columns: 3361 stats_df["DQ"] = None 3362 3363 if "TF" not in stats_df.columns: 3364 stats_df["TF"] = None 3365 3366 if "DBL_DBL" not in stats_df.columns: 3367 stats_df["DBL_DBL"] = None 3368 3369 if "TRP_DBL" not in stats_df.columns: 3370 stats_df["TRP_DBL"] = None 3371 3372 stats_df = stats_df.astype( 3373 { 3374 "DQ": "uint16", 3375 "TF": "uint16", 3376 }, 3377 errors="ignore" 3378 ) 3379 3380 stats_df = stats_df.infer_objects().fillna(0) 3381 stats_df = stats_df.astype( 3382 { 3383 "GP": "uint16", 3384 "GS": "uint16", 3385 "FGM": "uint16", 3386 "FGA": "uint16", 3387 "3PM": "uint16", 3388 "3PA": "uint16", 3389 "FT": "uint16", 3390 "FTA": "uint16", 3391 "PTS": "uint16", 3392 "ORB": "uint16", 3393 "DRB": "uint16", 3394 "TRB": "uint16", 3395 "AST": "uint16", 3396 "TOV": "uint16", 3397 "STL": "uint16", 3398 "BLK": "uint16", 3399 "PF": "uint16", 3400 "DBL_DBL": "uint16", 3401 "TRP_DBL": "uint16", 3402 # "school_id": "uint32", 3403 } 3404 ) 3405 3406 stats_df[["MP_minutes", "MP_seconds"]] = stats_df["MP_str"].str.split( 3407 ":", expand=True 3408 ) 3409 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 3410 "MP_minutes", "MP_seconds" 3411 ]].fillna(0) 3412 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 3413 "MP_minutes", "MP_seconds" 3414 ]].astype("uint16") 3415 stats_df["MP_total_seconds"] = ( 3416 stats_df["MP_seconds"] + (stats_df["MP_minutes"] * 60) 3417 ) 3418 3419 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 3420 stats_df["FG%"] = stats_df["FG%"].round(4) 3421 3422 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 3423 stats_df["3P%"] = stats_df["3P%"].round(4) 3424 3425 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 3426 stats_df["FT%"] = stats_df["FT%"].round(4) 3427 3428 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 3429 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 3430 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 3431 stats_df["2P%"] = stats_df["2P%"].round(4) 3432 3433 stats_df["eFG%"] = ( 3434 ( 3435 stats_df["FGM"] + 3436 (stats_df["3PM"] * 0.5) 3437 ) / 3438 stats_df["FGA"] 3439 ) 3440 stats_df["eFG%"] = stats_df["eFG%"].round(4) 3441 3442 stats_df["TSA"] = ( 3443 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 3444 ) 3445 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 3446 stats_df["TS%"] = stats_df["TS%"].round(4) 3447 3448 stats_df["TOV%"] = ( 3449 stats_df["TOV"] / 3450 ( 3451 stats_df["FGA"] + 3452 (stats_df["FTA"] * 0.44) + 3453 stats_df["TOV"] 3454 ) 3455 ) 3456 stats_df["TOV%"] = stats_df["TOV%"].round(4) 3457 3458 double_double_stats = ["PTS", "TRB", "AST", "BLK", "STL"] 3459 stats_df["DBL_DBL"] = (stats_df[double_double_stats] >= 10).sum(1) >= 2 3460 stats_df["TRP_DBL"] = (stats_df[double_double_stats] >= 10).sum(1) >= 3 3461 3462 stats_df = stats_df.astype( 3463 { 3464 "DBL_DBL": "uint16", 3465 "TRP_DBL": "uint16", 3466 }, 3467 errors="ignore" 3468 ) 3469 stats_df = stats_df.reindex( 3470 columns=stat_columns 3471 ) 3472 stats_df["game_id"] = game_id 3473 # print(stats_df.columns) 3474 stats_df.to_csv( 3475 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/game_stats/player/" 3476 + f"{game_id}_player_game_stats.csv", 3477 index=False 3478 ) 3479 return stats_df 3480 3481 3482def get_basketball_game_team_stats(game_id: int) -> pd.DataFrame: 3483 """ 3484 Given a valid game ID, 3485 this function will attempt to get all team game stats, if possible. 3486 3487 NOTE: Due to an issue with [stats.ncaa.org](stats.ncaa.org), 3488 full team game stats may not be loaded in through this function. 3489 3490 This is a known issue, however you should be able to get position 3491 data and starters information through this function 3492 3493 Parameters 3494 ---------- 3495 `game_id` (int, mandatory): 3496 Required argument. 3497 Specifies the game you want team game stats from. 3498 3499 Usage 3500 ---------- 3501 ```python 3502 3503 from ncaa_stats_py.basketball import get_basketball_game_team_stats 3504 3505 ######################################## 3506 # Men's Basketball # 3507 ######################################## 3508 3509 # Get the game stats of the 3510 # 2024 NCAA D1 Men's Basketball National Championship game. 3511 print( 3512 "Get the game stats of the " 3513 + "2024 NCAA D1 Men's Basketball National Championship game." 3514 ) 3515 df = get_basketball_game_team_stats(5254137) 3516 print(df) 3517 3518 # Get the game stats of a March Madness game on March 29th, 2024 3519 # between Duke and the Houston Cougars. 3520 print( 3521 "Get the game stats of a March Madness game on March 29th, 2024 " 3522 + "between Duke and the Houston Cougars." 3523 ) 3524 df = get_basketball_game_team_stats(5254126) 3525 print(df) 3526 3527 # Get the game stats of a St. Patrick's Day 3528 # game between the Duquesne Dukes and VCU Rams (D1). 3529 print( 3530 "Get the game stats of a St. Patrick's Day " 3531 + "game between the Duquesne Dukes and VCU Rams (D1)." 3532 ) 3533 df = get_basketball_game_team_stats(5252318) 3534 print(df) 3535 3536 # Get the game stats of a December 17th, 2023 3537 # game between the Barry Buccaneers and Findlay Oilers (D2). 3538 print( 3539 "Get the game stats of a December 17th, 2023 " 3540 + "game between the Barry Buccaneers and Findlay Oilers (D2)." 3541 ) 3542 df = get_basketball_game_team_stats(3960610) 3543 print(df) 3544 3545 # Get the game stats of a Valentine's Day 3546 # game between the Kalamazoo Hornets and the Trine Thunder (D2). 3547 print( 3548 "Get the game stats of a Valentine's Day " 3549 + "game between the Kalamazoo Hornets and the Trine Thunder (D2)." 3550 ) 3551 df = get_basketball_game_team_stats(3967963) 3552 print(df) 3553 3554 3555 ######################################## 3556 # Women's Basketball # 3557 ######################################## 3558 3559 # Get the game stats of the 3560 # 2024 NCAA D1 Women's Basketball National Championship game. 3561 print( 3562 "Get the game stats of the " 3563 + "2024 NCAA D1 Women's Basketball National Championship game" 3564 ) 3565 df = get_basketball_game_team_stats(5254137) 3566 print(df) 3567 3568 # Get the game stats of a March 3rd, 2024 3569 # game between Duke and the North Carolina Tar Heels. 3570 print( 3571 "Get the game stats of a March 3rd, 2024 " 3572 + "game between Duke and the North Carolina Tar Heels" 3573 ) 3574 df = get_basketball_game_team_stats(3984600) 3575 print(df) 3576 3577 # Get the game stats of a Thanksgiving Day 3578 # game between the Sacred Heart Pioneers and the P.R.-Mayaguez Janes (D2). 3579 print( 3580 "Get the game stats of a Thanksgiving Day " 3581 + "game between the Sacred Heart Pioneers and " 3582 + "the P.R.-Mayaguez Janes (D2)." 3583 ) 3584 df = get_basketball_game_team_stats(3972687) 3585 print(df) 3586 3587 # Get the game stats of a January 21st, 2024 3588 # game between the Puget Sound Loggers 3589 # and the Whitworth Pirates (D3). 3590 print( 3591 "Get the game stats of a January 21st, 2024 " 3592 + "game between the Puget Sound Loggers and " 3593 + "the Whitworth Pirates (D3)." 3594 ) 3595 df = get_basketball_game_team_stats(3979051) 3596 3597 ``` 3598 3599 Returns 3600 ---------- 3601 A pandas `DataFrame` object with team game stats in a given game. 3602 3603 """ 3604 df = get_basketball_game_player_stats(game_id=game_id) 3605 # print(df.columns) 3606 df = df.infer_objects() 3607 stats_df = df.groupby( 3608 ["season", "game_id", "team_id", "team_name"], 3609 as_index=False 3610 ).agg( 3611 { 3612 # "MP_minutes": "sum", 3613 # "MP_seconds": "sum", 3614 "MP_total_seconds": "sum", 3615 "FGM": "sum", 3616 "FGA": "sum", 3617 "3PM": "sum", 3618 "3PA": "sum", 3619 "2PM": "sum", 3620 "2PA": "sum", 3621 "FT": "sum", 3622 "FTA": "sum", 3623 "ORB": "sum", 3624 "DRB": "sum", 3625 "TRB": "sum", 3626 "AST": "sum", 3627 "STL": "sum", 3628 "BLK": "sum", 3629 "TOV": "sum", 3630 "PF": "sum", 3631 "TF": "sum", 3632 "PTS": "sum", 3633 "DQ": "sum", 3634 "DBL_DBL": "sum", 3635 "TRP_DBL": "sum", 3636 } 3637 ) 3638 stats_df["MP_str"] = stats_df["MP_total_seconds"].map( 3639 _get_minute_formatted_time_from_seconds 3640 ) 3641 3642 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 3643 stats_df["FG%"] = stats_df["FG%"].round(4) 3644 3645 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 3646 stats_df["3P%"] = stats_df["3P%"].round(4) 3647 3648 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 3649 stats_df["FT%"] = stats_df["FT%"].round(4) 3650 3651 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 3652 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 3653 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 3654 stats_df["2P%"] = stats_df["2P%"].round(4) 3655 3656 stats_df["eFG%"] = ( 3657 ( 3658 stats_df["FGM"] + 3659 (stats_df["3PM"] * 0.5) 3660 ) / 3661 stats_df["FGA"] 3662 ) 3663 stats_df["eFG%"] = stats_df["eFG%"].round(4) 3664 3665 stats_df["TSA"] = ( 3666 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 3667 ) 3668 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 3669 stats_df["TS%"] = stats_df["TS%"].round(4) 3670 3671 stats_df["TOV%"] = ( 3672 stats_df["TOV"] / 3673 ( 3674 stats_df["FGA"] + 3675 (stats_df["FTA"] * 0.44) + 3676 stats_df["TOV"] 3677 ) 3678 ) 3679 stats_df["TOV%"] = stats_df["TOV%"].round(4) 3680 3681 return stats_df 3682 3683 3684def get_basketball_raw_pbp(game_id: int) -> pd.DataFrame: 3685 """ 3686 Given a valid game ID, 3687 this function will attempt to get the raw play-by-play (PBP) 3688 data for that game. 3689 3690 Parameters 3691 ---------- 3692 `game_id` (int, mandatory): 3693 Required argument. 3694 Specifies the game you want play-by-play data (PBP) from. 3695 3696 Usage 3697 ---------- 3698 ```python 3699 3700 from ncaa_stats_py.basketball import get_basketball_raw_pbp 3701 3702 ######################################## 3703 # Men's Basketball # 3704 ######################################## 3705 3706 # Get the play-by-play data of the 3707 # 2024 NCAA D1 Men's Basketball National Championship game. 3708 print( 3709 "Get the play-by-play data of the " 3710 + "2024 NCAA D1 Men's Basketball National Championship game." 3711 ) 3712 df = get_basketball_raw_pbp(5254137) 3713 print(df) 3714 3715 # Get the play-by-play data of a March Madness game on March 29th, 2024 3716 # between Duke and the Houston Cougars. 3717 print( 3718 "Get the play-by-play data " 3719 + "of a March Madness game on March 29th, 2024 " 3720 + "between Duke and the Houston Cougars." 3721 ) 3722 df = get_basketball_raw_pbp(5254126) 3723 print(df) 3724 3725 # Get the play-by-play data of a February 28th 3726 # game between the Winthrop Eagles and High Point Panthers. 3727 print( 3728 "Get the play-by-play data of a February 28th " 3729 + "game between the Winthrop Eagles and High Point Panthers." 3730 ) 3731 df = get_basketball_raw_pbp(3969302) 3732 print(df) 3733 3734 # Get the play-by-play data of a December 19th, 2022 3735 # game between the San Francisco St. Gators and 3736 # the Cal St. Monterey Bay Otters (D2). 3737 print( 3738 "Get the play-by-play data of a December 19th, 2022 " 3739 + "game between the San Francisco St. Gators and " + 3740 "the Cal St. Monterey Bay Otters (D2)." 3741 ) 3742 df = get_basketball_raw_pbp(2341500) 3743 print(df) 3744 3745 # Get the play-by-play data of a January 3rd, 2022 3746 # game between the Hamline Pipers and the St. Olaf Oles (D3). 3747 print( 3748 "Get the play-by-play data of a January 3rd, 2022 " 3749 + "game between the Hamline Pipers and the St. Olaf Oles (D3)." 3750 ) 3751 df = get_basketball_raw_pbp(3967963) 3752 print(df) 3753 3754 3755 ######################################## 3756 # Women's Basketball # 3757 ######################################## 3758 3759 # Get the play-by-play data of the 3760 # 2024 NCAA D1 Women's Basketball National Championship game. 3761 print( 3762 "Get the play-by-play data of the " 3763 + "2024 NCAA D1 Women's Basketball National Championship game." 3764 ) 3765 df = get_basketball_raw_pbp(5254137) 3766 print(df) 3767 3768 # Get the play-by-play data of a March 12th, 2021 3769 # game between the La Salle Explorers and the Dayton Flyers. 3770 print( 3771 "Get the play-by-play data of a March 12th, 2021 " 3772 + "game between the La Salle Explorers and the Dayton Flyers." 3773 ) 3774 df = get_basketball_raw_pbp(2055636) 3775 print(df) 3776 3777 # Get the play-by-play data of a February 6th, 2020 3778 # game between Purdue Northwest and the Michigan Tech Huskies (D2). 3779 print( 3780 "Get the play-by-play data of a Thanksgiving Day " 3781 + "game between the Sacred Heart Pioneers and " 3782 + "the P.R.-Mayaguez Janes (D2)." 3783 ) 3784 df = get_basketball_raw_pbp(1793405) 3785 print(df) 3786 3787 # Get the play-by-play data of a January 5th, 2019 3788 # game between the Puget Sound Loggers 3789 # and the Whitworth Pirates (D3). 3790 print( 3791 "Get the play-by-play data of a January 5th, 2019 " 3792 + "game between the Simpson Storm and " 3793 + "the Dubuque Spartans (D3)." 3794 ) 3795 df = get_basketball_raw_pbp(1625974) 3796 print(df) 3797 3798 ``` 3799 3800 Returns 3801 ---------- 3802 A pandas `DataFrame` object with a play-by-play (PBP) data in a given game. 3803 3804 """ 3805 load_from_cache = True 3806 is_overtime = False 3807 3808 sport_id = "" 3809 season = 0 3810 away_score = 0 3811 home_score = 0 3812 3813 mbb_teams_df = load_basketball_teams(get_wbb_data=False) 3814 mbb_team_ids_arr = mbb_teams_df["team_id"].to_list() 3815 3816 wbb_teams_df = load_basketball_teams(get_wbb_data=True) 3817 wbb_team_ids_arr = wbb_teams_df["team_id"].to_list() 3818 3819 pbp_df = pd.DataFrame() 3820 pbp_df_arr = [] 3821 temp_df = pd.DataFrame() 3822 3823 temp_df = pd.DataFrame() 3824 home_dir = expanduser("~") 3825 home_dir = _format_folder_str(home_dir) 3826 3827 stat_columns = [ 3828 "season", 3829 "game_id", 3830 "sport_id", 3831 "game_datetime", 3832 "half_num", 3833 "event_num", 3834 "game_time_str", 3835 "game_time_seconds", 3836 "game_time_milliseconds", 3837 "event_team", 3838 "event_text", 3839 "is_overtime", 3840 "stadium_name", 3841 "attendance", 3842 "away_team_id", 3843 "away_team_name", 3844 "home_team_id", 3845 "home_team_name", 3846 ] 3847 3848 url = f"https://stats.ncaa.org/contests/{game_id}/play_by_play" 3849 3850 if exists(f"{home_dir}/.ncaa_stats_py/"): 3851 pass 3852 else: 3853 mkdir(f"{home_dir}/.ncaa_stats_py/") 3854 3855 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/"): 3856 pass 3857 else: 3858 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/") 3859 3860 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/"): 3861 pass 3862 else: 3863 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/") 3864 3865 if exists( 3866 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 3867 + f"{game_id}_raw_pbp.csv" 3868 ): 3869 games_df = pd.read_csv( 3870 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 3871 + f"{game_id}_raw_pbp.csv" 3872 ) 3873 games_df = games_df.infer_objects() 3874 file_mod_datetime = datetime.fromtimestamp( 3875 getmtime( 3876 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 3877 + f"{game_id}_raw_pbp.csv" 3878 ) 3879 ) 3880 load_from_cache = True 3881 else: 3882 file_mod_datetime = datetime.today() 3883 load_from_cache = False 3884 3885 if exists(f"{home_dir}/.ncaa_stats_py/"): 3886 pass 3887 else: 3888 mkdir(f"{home_dir}/.ncaa_stats_py/") 3889 3890 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/"): 3891 pass 3892 else: 3893 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/") 3894 3895 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/"): 3896 pass 3897 else: 3898 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/") 3899 3900 if exists( 3901 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 3902 + f"{game_id}_raw_pbp.csv" 3903 ): 3904 games_df = pd.read_csv( 3905 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 3906 + f"{game_id}_raw_pbp.csv" 3907 ) 3908 games_df = games_df.infer_objects() 3909 file_mod_datetime = datetime.fromtimestamp( 3910 getmtime( 3911 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 3912 + f"{game_id}_raw_pbp.csv" 3913 ) 3914 ) 3915 load_from_cache = True 3916 else: 3917 logging.info("Could not find a WBB player game stats file") 3918 3919 now = datetime.today() 3920 3921 age = now - file_mod_datetime 3922 3923 if age.days >= 35: 3924 load_from_cache = False 3925 3926 if load_from_cache is True: 3927 return games_df 3928 3929 response = _get_webpage(url=url) 3930 soup = BeautifulSoup(response.text, features="lxml") 3931 3932 info_table = soup.find( 3933 "td", 3934 { 3935 "style": "padding: 0px 30px 0px 30px", 3936 "class": "d-none d-md-table-cell" 3937 } 3938 ).find( 3939 "table", 3940 {"style": "border-collapse: collapse"} 3941 ) 3942 3943 info_table_rows = info_table.find_all("tr") 3944 3945 game_date_str = info_table_rows[3].find("td").text 3946 if "TBA" in game_date_str: 3947 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBA') 3948 elif "tba" in game_date_str: 3949 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tba') 3950 elif "TBD" in game_date_str: 3951 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBD') 3952 elif "tbd" in game_date_str: 3953 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tbd') 3954 elif ( 3955 "tbd" not in game_date_str.lower() and 3956 ":" not in game_date_str.lower() 3957 ): 3958 game_date_str = game_date_str.replace(" ", "") 3959 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y') 3960 else: 3961 game_datetime = datetime.strptime( 3962 game_date_str, 3963 '%m/%d/%Y %I:%M %p' 3964 ) 3965 game_datetime = game_datetime.astimezone(timezone("US/Eastern")) 3966 game_date_str = game_datetime.isoformat() 3967 del game_datetime 3968 3969 stadium_str = info_table_rows[4].find("td").text 3970 3971 attendance_str = info_table_rows[5].find("td").text 3972 attendance_int = re.findall( 3973 r"([0-9\,]+)", 3974 attendance_str 3975 )[0] 3976 attendance_int = attendance_int.replace(",", "") 3977 attendance_int = int(attendance_int) 3978 3979 del attendance_str 3980 team_cards = soup.find_all( 3981 "td", 3982 { 3983 "valign": "center", 3984 "class": "grey_text d-none d-sm-table-cell" 3985 } 3986 ) 3987 3988 away_url = team_cards[0].find_all("a") 3989 away_url = away_url[0] 3990 home_url = team_cards[1].find_all("a") 3991 home_url = home_url[0] 3992 3993 away_team_name = away_url.text 3994 home_team_name = home_url.text 3995 3996 away_team_id = away_url.get("href") 3997 home_team_id = home_url.get("href") 3998 3999 away_team_id = away_team_id.replace("/teams", "") 4000 away_team_id = away_team_id.replace("/team", "") 4001 away_team_id = away_team_id.replace("/", "") 4002 away_team_id = int(away_team_id) 4003 4004 home_team_id = home_team_id.replace("/teams", "") 4005 home_team_id = home_team_id.replace("/team", "") 4006 home_team_id = home_team_id.replace("/", "") 4007 home_team_id = int(home_team_id) 4008 4009 if home_team_id in mbb_team_ids_arr: 4010 sport_id = "MBB" 4011 temp_df = mbb_teams_df[mbb_teams_df["team_id"] == home_team_id] 4012 season = temp_df["season"].iloc[0] 4013 del temp_df 4014 elif home_team_id in wbb_team_ids_arr: 4015 sport_id = "WBB" 4016 temp_df = wbb_teams_df[wbb_teams_df["team_id"] == home_team_id] 4017 season = temp_df["season"].iloc[0] 4018 del temp_df 4019 # This should never be the case, 4020 # but if something goes very horribly wrong, 4021 # double check the away team ID to 4022 # the MBB and WBB team ID list. 4023 elif away_team_id in mbb_team_ids_arr: 4024 sport_id = "MBB" 4025 temp_df = mbb_teams_df[mbb_teams_df["team_id"] == away_team_id] 4026 season = temp_df["season"].iloc[0] 4027 del temp_df 4028 elif away_team_id in wbb_team_ids_arr: 4029 sport_id = "WBB" 4030 temp_df = wbb_teams_df[wbb_teams_df["team_id"] == home_team_id] 4031 season = temp_df["season"].iloc[0] 4032 del temp_df 4033 # If we get to this, we are in a code red situation. 4034 # "SHUT IT DOWN" - Gordon Ramsay 4035 else: 4036 raise ValueError( 4037 "Could not identify if this is a " + 4038 "MBB or WBB game based on team IDs. " 4039 ) 4040 4041 section_cards = soup.find_all( 4042 "div", 4043 {"class": "row justify-content-md-center w-100"} 4044 ) 4045 4046 for card in section_cards: 4047 # top_bot = "" 4048 event_text = "" 4049 half_str = card.find( 4050 "div", 4051 {"class": "card-header"} 4052 ).text 4053 half_num = re.findall( 4054 r"([0-9]+)", 4055 half_str 4056 ) 4057 4058 half_num = int(half_num[0]) 4059 if "ot" in half_str.lower(): 4060 is_overtime = True 4061 half_num += 2 4062 table_body = card.find("table").find("tbody").find_all("tr") 4063 4064 for row in table_body: 4065 t_cells = row.find_all("td") 4066 t_cells = [x.text.strip() for x in t_cells] 4067 game_time_str = t_cells[0] 4068 4069 if len(t_cells[1]) > 0: 4070 event_team = away_team_id 4071 event_text = t_cells[1] 4072 elif len(t_cells[3]) > 0: 4073 event_team = home_team_id 4074 event_text = t_cells[3] 4075 4076 if t_cells[1].lower() == "game start": 4077 pass 4078 elif t_cells[1].lower() == "jumpball startperiod": 4079 pass 4080 elif t_cells[1].lower() == "period start": 4081 pass 4082 elif t_cells[1].lower() == "period end confirmed;": 4083 pass 4084 elif t_cells[1].lower() == "period end confirmed": 4085 pass 4086 elif t_cells[1].lower() == "game end confirmed;": 4087 pass 4088 elif t_cells[1].lower() == "game end confirmed": 4089 pass 4090 elif t_cells[1].lower() == "timeout commercial": 4091 pass 4092 else: 4093 away_score, home_score = t_cells[2].split("-") 4094 4095 away_score = int(away_score) 4096 home_score = int(home_score) 4097 if len(game_time_str.split(":")) == 3: 4098 temp_time_minutes, temp_time_seconds, game_time_ms = \ 4099 game_time_str.split(":") 4100 elif len(game_time_str.split(":")) == 2: 4101 temp_time_minutes, temp_time_seconds = \ 4102 game_time_str.split(":") 4103 game_time_ms = 0 4104 4105 temp_time_minutes = int(temp_time_minutes) 4106 temp_time_seconds = int(temp_time_seconds) 4107 game_time_ms = int(game_time_ms) 4108 game_time_seconds = temp_time_seconds + (temp_time_minutes * 60) 4109 4110 if half_num == 1: 4111 half_seconds_remaining = game_time_seconds 4112 half_ms_remaining = game_time_ms 4113 4114 game_time_seconds += 1200 4115 else: 4116 half_seconds_remaining = game_time_seconds 4117 half_ms_remaining = game_time_ms 4118 4119 temp_df = pd.DataFrame( 4120 { 4121 # "season": season, 4122 # "game_id": game_id, 4123 # "sport_id": sport_id, 4124 # "away_team_id": away_team_id, 4125 # "away_team_name": away_team_name, 4126 # "home_team_id": home_team_id, 4127 # "home_team_name": home_team_name, 4128 "game_time_str": game_time_str, 4129 "half_seconds_remaining": half_seconds_remaining, 4130 "half_milliseconds_remaining": half_ms_remaining, 4131 "game_seconds_remaining": game_time_seconds, 4132 "game_milliseconds_remaining": game_time_ms, 4133 "half_num": half_num, 4134 "event_team": event_team, 4135 "event_text": event_text, 4136 "is_overtime": is_overtime 4137 }, 4138 index=[0], 4139 ) 4140 pbp_df_arr.append(temp_df) 4141 4142 pbp_df = pd.concat(pbp_df_arr, ignore_index=True) 4143 pbp_df["event_num"] = pbp_df.index + 1 4144 pbp_df["game_datetime"] = game_date_str 4145 pbp_df["season"] = season 4146 pbp_df["game_id"] = game_id 4147 pbp_df["sport_id"] = sport_id 4148 pbp_df["stadium_name"] = stadium_str 4149 pbp_df["attendance"] = attendance_int 4150 pbp_df["away_team_id"] = away_team_id 4151 pbp_df["away_team_name"] = away_team_name 4152 pbp_df["home_team_id"] = home_team_id 4153 pbp_df["home_team_name"] = home_team_name 4154 4155 pbp_df = pbp_df.reindex(columns=stat_columns) 4156 pbp_df = pbp_df.infer_objects() 4157 4158 if sport_id == "MBB": 4159 pbp_df.to_csv( 4160 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 4161 + f"{game_id}_raw_pbp.csv", 4162 index=False 4163 ) 4164 elif sport_id == "WBB": 4165 pbp_df.to_csv( 4166 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 4167 + f"{game_id}_raw_pbp.csv", 4168 index=False 4169 ) 4170 else: 4171 raise ValueError( 4172 f"Improper Sport ID: `{sport_id}`" 4173 ) 4174 4175 return pbp_df 4176 4177 4178def get_basketball_game_starters(game_id: int) -> list: 4179 """ 4180 Given a valid game ID, this function will attempt to 4181 get the starting lineup out of the raw play-by-play data 4182 from the game. 4183 4184 NOTE #1: The layout of the list will be as follows: 4185 4186 > | Index | **Away players** | 4187 > | :---: | :------------------: | 4188 > | 0 | Away team starter #1 | 4189 > | 1 | Away team starter #2 | 4190 > | 2 | Away team starter #3 | 4191 > | 3 | Away team starter #4 | 4192 > | 4 | Away team starter #5 | 4193 4194 > | Index | **Home players** | 4195 > | :---: | :------------------: | 4196 > | 5 | Home team starter #1 | 4197 > | 6 | Home team starter #2 | 4198 > | 7 | Home team starter #3 | 4199 > | 8 | Home team starter #4 | 4200 > | 9 | Home team starter #5 | 4201 4202 NOTE #2: Starters are listed in order of when they first sub out. 4203 Do not assume that starter #5 for a team is a center, 4204 or that starter #1 is a PG! 4205 4206 Returns 4207 ---------- 4208 A list of starters from a specific basketball game ID. 4209 4210 """ 4211 starters_list = [] 4212 pbp_df = get_basketball_raw_pbp(game_id=game_id) 4213 away_team_id = pbp_df["away_team_id"].iloc[0] 4214 home_team_id = pbp_df["home_team_id"].iloc[0] 4215 # pointer_int = 0 4216 4217 for team_id in [away_team_id, home_team_id]: 4218 temp_starters_list = [] 4219 4220 temp_df = pbp_df[pbp_df["event_team"] == team_id] 4221 4222 play_text_list = temp_df["event_text"].to_list() 4223 4224 for play_txt in play_text_list: 4225 if len(temp_starters_list) == 5: 4226 break 4227 elif "substitution out" in play_txt: 4228 player_txt = play_txt.split(",")[0] 4229 if play_txt in temp_starters_list: 4230 pass 4231 elif player_txt.lower() == "team": 4232 pass 4233 elif (player_txt is None) or (len(player_txt) == 0): 4234 raise ValueError( 4235 "Player cannot be NULL." 4236 ) 4237 else: 4238 temp_starters_list.append(player_txt) 4239 4240 if len(temp_starters_list) < 5: 4241 raise ValueError( 4242 f"Could not find all 5 starters for team ID {team_id} " + 4243 f"in game ID {game_id}" 4244 ) 4245 for txt in temp_starters_list: 4246 starters_list.append(txt) 4247 return starters_list 4248 4249 4250def get_basketball_game_shot_locations(game_id: int) -> pd.DataFrame: 4251 """ """ 4252 raise NotImplementedError( 4253 "It's not implemented yet." 4254 )
37def get_basketball_teams( 38 season: int, 39 level: str | int, 40 get_wbb_data: bool = False 41) -> pd.DataFrame: 42 """ 43 Retrieves a list of basketball teams from the NCAA. 44 45 Parameters 46 ---------- 47 `season` (int, mandatory): 48 Required argument. 49 Specifies the season you want NCAA basketball team information from. 50 51 `level` (int, mandatory): 52 Required argument. 53 Specifies the level/division you want 54 NCAA basketball team information from. 55 This can either be an integer (1-3) or a string ("I"-"III"). 56 57 `get_wbb_data` (bool, optional): 58 Optional argument. 59 If you want women's basketball data instead of men's basketball data, 60 set this to `True`. 61 62 Usage 63 ---------- 64 ```python 65 66 from ncaa_stats_py.basketball import get_basketball_teams 67 68 ######################################## 69 # Men's Basketball # 70 ######################################## 71 72 # Get all D1 men's basketball teams for the 2024 season. 73 print("Get all D1 men's basketball teams for the 2024 season.") 74 df = get_basketball_teams(2024, 1) 75 print(df) 76 77 # Get all D2 men's basketball teams for the 2023 season. 78 print("Get all D2 men's basketball teams for the 2023 season.") 79 df = get_basketball_teams(2023, 2) 80 print(df) 81 82 # Get all D3 men's basketball teams for the 2022 season. 83 print("Get all D3 men's basketball teams for the 2022 season.") 84 df = get_basketball_teams(2022, 3) 85 print(df) 86 87 # Get all D1 men's basketball teams for the 2021 season. 88 print("Get all D1 men's basketball teams for the 2021 season.") 89 df = get_basketball_teams(2021, "I") 90 print(df) 91 92 # Get all D2 men's basketball teams for the 2020 season. 93 print("Get all D2 men's basketball teams for the 2020 season.") 94 df = get_basketball_teams(2020, "II") 95 print(df) 96 97 # Get all D3 men's basketball teams for the 2019 season. 98 print("Get all D3 men's basketball teams for the 2019 season.") 99 df = get_basketball_teams(2019, "III") 100 print(df) 101 102 ######################################## 103 # Women's Basketball # 104 ######################################## 105 106 # Get all D1 women's basketball teams for the 2024 season. 107 print( 108 "Get all D1 women's basketball teams for the 2024 season." 109 ) 110 df = get_basketball_teams(2024, 1) 111 print(df) 112 113 # Get all D2 women's basketball teams for the 2023 season. 114 print( 115 "Get all D2 women's basketball teams for the 2023 season." 116 ) 117 df = get_basketball_teams(2023, 2) 118 print(df) 119 120 # Get all D3 women's basketball teams for the 2022 season. 121 print( 122 "Get all D3 women's basketball teams for the 2022 season." 123 ) 124 df = get_basketball_teams(2022, 3) 125 print(df) 126 127 # Get all D1 women's basketball teams for the 2021 season. 128 print( 129 "Get all D1 women's basketball teams for the 2021 season." 130 ) 131 df = get_basketball_teams(2021, "I") 132 print(df) 133 134 # Get all D2 women's basketball teams for the 2020 season. 135 print( 136 "Get all D2 women's basketball teams for the 2020 season." 137 ) 138 df = get_basketball_teams(2020, "II") 139 print(df) 140 141 # Get all D3 women's basketball teams for the 2019 season. 142 print( 143 "Get all D3 women's basketball teams for the 2019 season." 144 ) 145 df = get_basketball_teams(2019, "III") 146 print(df) 147 148 ``` 149 150 Returns 151 ---------- 152 A pandas `DataFrame` object with a list of college basketball teams 153 in that season and NCAA level. 154 """ 155 # def is_comment(elem): 156 # return isinstance(elem, Comment) 157 sport_id = "" 158 # stat_sequence = 0 159 load_from_cache = True 160 home_dir = expanduser("~") 161 home_dir = _format_folder_str(home_dir) 162 teams_df = pd.DataFrame() 163 teams_df_arr = [] 164 temp_df = pd.DataFrame() 165 formatted_level = "" 166 ncaa_level = 0 167 168 if get_wbb_data is True: 169 sport_id = "WBB" 170 stat_sequence = 169 171 else: 172 sport_id = "MBB" 173 stat_sequence = 168 174 175 if isinstance(level, int) and level == 1: 176 formatted_level = "I" 177 ncaa_level = 1 178 elif isinstance(level, int) and level == 2: 179 formatted_level = "II" 180 ncaa_level = 2 181 elif isinstance(level, int) and level == 3: 182 formatted_level = "III" 183 ncaa_level = 3 184 elif isinstance(level, str) and ( 185 level.lower() == "i" or level.lower() == "d1" or level.lower() == "1" 186 ): 187 ncaa_level = 1 188 formatted_level = level.upper() 189 elif isinstance(level, str) and ( 190 level.lower() == "ii" or level.lower() == "d2" or level.lower() == "2" 191 ): 192 ncaa_level = 2 193 formatted_level = level.upper() 194 elif isinstance(level, str) and ( 195 level.lower() == "iii" or level.lower() == "d3" or level.lower() == "3" 196 ): 197 ncaa_level = 3 198 formatted_level = level.upper() 199 200 if exists(f"{home_dir}/.ncaa_stats_py/"): 201 pass 202 else: 203 mkdir(f"{home_dir}/.ncaa_stats_py/") 204 205 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 206 pass 207 else: 208 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 209 210 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/"): 211 pass 212 else: 213 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}//teams/") 214 215 if exists( 216 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 217 + f"{season}_{formatted_level}_teams.csv" 218 ): 219 teams_df = pd.read_csv( 220 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 221 + f"{season}_{formatted_level}_teams.csv" 222 ) 223 file_mod_datetime = datetime.fromtimestamp( 224 getmtime( 225 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 226 + f"{season}_{formatted_level}_teams.csv" 227 ) 228 ) 229 else: 230 file_mod_datetime = datetime.today() 231 load_from_cache = False 232 233 now = datetime.today() 234 235 age = now - file_mod_datetime 236 237 if ( 238 age.days > 1 and 239 season >= (now.year - 1) and 240 now.month <= 7 241 ): 242 load_from_cache = False 243 elif age.days >= 35: 244 load_from_cache = False 245 246 if load_from_cache is True: 247 return teams_df 248 249 logging.warning( 250 f"Either we could not load {season} D{level} schools from cache, " 251 + "or it's time to refresh the cached data." 252 ) 253 schools_df = _get_schools() 254 url = ( 255 "https://stats.ncaa.org/rankings/change_sport_year_div?" 256 + f"academic_year={season}.0&division={ncaa_level}.0" + 257 f"&sport_code={sport_id}" 258 ) 259 260 response = _get_webpage(url=url) 261 262 soup = BeautifulSoup(response.text, features="lxml") 263 ranking_periods = soup.find("select", {"name": "rp", "id": "rp"}) 264 ranking_periods = ranking_periods.find_all("option") 265 266 rp_value = 0 267 found_value = False 268 269 while found_value is False: 270 # print("check") 271 for rp in ranking_periods: 272 if "final " in rp.text.lower(): 273 rp_value = rp.get("value") 274 found_value = True 275 break 276 else: 277 rp_value = rp.get("value") 278 found_value = True 279 break 280 281 url = ( 282 "https://stats.ncaa.org/rankings/institution_trends?" 283 + f"academic_year={season}.0&division={ncaa_level}.0&" 284 + f"ranking_period={rp_value}&sport_code={sport_id}" 285 + f"&sport_code={sport_id}" 286 ) 287 288 best_method = True 289 if ( 290 (season < 2015 and sport_id == "MBB") 291 ): 292 url = ( 293 "https://stats.ncaa.org/rankings/national_ranking?" 294 + f"academic_year={season}.0&division={ncaa_level}.0&" 295 + f"ranking_period={rp_value}&sport_code={sport_id}" 296 + f"&stat_seq={stat_sequence}" 297 ) 298 response = _get_webpage(url=url) 299 best_method = False 300 elif season < 2013: 301 url = ( 302 "https://stats.ncaa.org/rankings/national_ranking?" 303 + f"academic_year={season}.0&division={ncaa_level}.0&" 304 + f"ranking_period={rp_value}&sport_code={sport_id}" 305 + f"&stat_seq={stat_sequence}" 306 ) 307 response = _get_webpage(url=url) 308 best_method = False 309 else: 310 try: 311 response = _get_webpage(url=url) 312 except Exception as e: 313 logging.info(f"Found exception when loading teams `{e}`") 314 logging.info("Attempting backup method.") 315 url = ( 316 "https://stats.ncaa.org/rankings/national_ranking?" 317 + f"academic_year={season}.0&division={ncaa_level}.0&" 318 + f"ranking_period={rp_value}&sport_code={sport_id}" 319 + f"&stat_seq={stat_sequence}" 320 ) 321 response = _get_webpage(url=url) 322 best_method = False 323 324 soup = BeautifulSoup(response.text, features="lxml") 325 326 if best_method is True: 327 soup = soup.find( 328 "table", 329 {"id": "stat_grid"}, 330 ) 331 soup = soup.find("tbody") 332 t_rows = soup.find_all("tr") 333 334 for t in t_rows: 335 team_id = t.find("a") 336 team_id = team_id.get("href") 337 team_id = team_id.replace("/teams/", "") 338 team_id = int(team_id) 339 team_name = t.find_all("td")[0].text 340 team_conference_name = t.find_all("td")[1].text 341 # del team 342 temp_df = pd.DataFrame( 343 { 344 "season": season, 345 "ncaa_division": ncaa_level, 346 "ncaa_division_formatted": formatted_level, 347 "team_conference_name": team_conference_name, 348 "team_id": team_id, 349 "school_name": team_name, 350 "sport_id": sport_id, 351 }, 352 index=[0], 353 ) 354 teams_df_arr.append(temp_df) 355 del temp_df 356 else: 357 soup = soup.find( 358 "table", 359 {"id": "rankings_table"}, 360 ) 361 soup = soup.find("tbody") 362 t_rows = soup.find_all("tr") 363 364 for t in t_rows: 365 team_id = t.find("a") 366 team_id = team_id.get("href") 367 team_id = team_id.replace("/teams/", "") 368 team_id = int(team_id) 369 team = t.find_all("td")[1].get("data-order") 370 team_name, team_conference_name = team.split(",") 371 del team 372 temp_df = pd.DataFrame( 373 { 374 "season": season, 375 "ncaa_division": ncaa_level, 376 "ncaa_division_formatted": formatted_level, 377 "team_conference_name": team_conference_name, 378 "team_id": team_id, 379 "school_name": team_name, 380 "sport_id": sport_id, 381 }, 382 index=[0], 383 ) 384 teams_df_arr.append(temp_df) 385 del temp_df 386 387 teams_df = pd.concat(teams_df_arr, ignore_index=True) 388 teams_df = pd.merge( 389 left=teams_df, 390 right=schools_df, 391 on=["school_name"], 392 how="left" 393 ) 394 teams_df.sort_values(by=["team_id"], inplace=True) 395 396 teams_df.to_csv( 397 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/teams/" 398 + f"{season}_{formatted_level}_teams.csv", 399 index=False, 400 ) 401 402 return teams_df
Retrieves a list of basketball teams from the NCAA.
Parameters
season
(int, mandatory):
Required argument.
Specifies the season you want NCAA basketball team information from.
level
(int, mandatory):
Required argument.
Specifies the level/division you want
NCAA basketball team information from.
This can either be an integer (1-3) or a string ("I"-"III").
get_wbb_data
(bool, optional):
Optional argument.
If you want women's basketball data instead of men's basketball data,
set this to True
.
Usage
from ncaa_stats_py.basketball import get_basketball_teams
########################################
# Men's Basketball #
########################################
# Get all D1 men's basketball teams for the 2024 season.
print("Get all D1 men's basketball teams for the 2024 season.")
df = get_basketball_teams(2024, 1)
print(df)
# Get all D2 men's basketball teams for the 2023 season.
print("Get all D2 men's basketball teams for the 2023 season.")
df = get_basketball_teams(2023, 2)
print(df)
# Get all D3 men's basketball teams for the 2022 season.
print("Get all D3 men's basketball teams for the 2022 season.")
df = get_basketball_teams(2022, 3)
print(df)
# Get all D1 men's basketball teams for the 2021 season.
print("Get all D1 men's basketball teams for the 2021 season.")
df = get_basketball_teams(2021, "I")
print(df)
# Get all D2 men's basketball teams for the 2020 season.
print("Get all D2 men's basketball teams for the 2020 season.")
df = get_basketball_teams(2020, "II")
print(df)
# Get all D3 men's basketball teams for the 2019 season.
print("Get all D3 men's basketball teams for the 2019 season.")
df = get_basketball_teams(2019, "III")
print(df)
########################################
# Women's Basketball #
########################################
# Get all D1 women's basketball teams for the 2024 season.
print(
"Get all D1 women's basketball teams for the 2024 season."
)
df = get_basketball_teams(2024, 1)
print(df)
# Get all D2 women's basketball teams for the 2023 season.
print(
"Get all D2 women's basketball teams for the 2023 season."
)
df = get_basketball_teams(2023, 2)
print(df)
# Get all D3 women's basketball teams for the 2022 season.
print(
"Get all D3 women's basketball teams for the 2022 season."
)
df = get_basketball_teams(2022, 3)
print(df)
# Get all D1 women's basketball teams for the 2021 season.
print(
"Get all D1 women's basketball teams for the 2021 season."
)
df = get_basketball_teams(2021, "I")
print(df)
# Get all D2 women's basketball teams for the 2020 season.
print(
"Get all D2 women's basketball teams for the 2020 season."
)
df = get_basketball_teams(2020, "II")
print(df)
# Get all D3 women's basketball teams for the 2019 season.
print(
"Get all D3 women's basketball teams for the 2019 season."
)
df = get_basketball_teams(2019, "III")
print(df)
Returns
A pandas DataFrame
object with a list of college basketball teams
in that season and NCAA level.
405def load_basketball_teams( 406 start_year: int = 2011, 407 get_wbb_data: bool = False 408) -> pd.DataFrame: 409 """ 410 Compiles a list of known NCAA basketball teams in NCAA basketball history. 411 412 Parameters 413 ---------- 414 `start_year` (int, optional): 415 Optional argument. 416 Specifies the first season you want 417 NCAA basketball team information from. 418 419 `get_wbb_data` (bool, optional): 420 Optional argument. 421 If you want women's basketball data instead of men's basketball data, 422 set this to `True`. 423 424 Usage 425 ---------- 426 ```python 427 428 from ncaa_stats_py.basketball import load_basketball_teams 429 430 # WARNING: Running this script "as-is" for the first time may 431 # take some time. 432 # The *N*th time you run this script will be faster. 433 434 # Load in every women's basketball team 435 # from 2011 to present day. 436 print( 437 "Load in every women's basketball team " + 438 "from 2011 to present day." 439 ) 440 df = load_basketball_teams(get_wbb_data=True) 441 print(df) 442 443 # Load in every men's basketball team 444 # from 2011 to present day. 445 print( 446 "Load in every men's basketball team " + 447 "from 2011 to present day." 448 ) 449 df = load_basketball_teams() 450 print(df) 451 452 # Load in every men's basketball team 453 # from 2020 to present day. 454 print( 455 "Load in every men's basketball team " + 456 "from 2020 to present day." 457 ) 458 df = load_basketball_teams(start_year=2020) 459 print(df) 460 461 ``` 462 463 Returns 464 ---------- 465 A pandas `DataFrame` object with a list of 466 all known college basketball teams. 467 468 """ 469 # start_year = 2008 470 471 # if get_wbb_data is True: 472 # sport_id = "WBB" 473 # else: 474 # sport_id = "MBB" 475 476 teams_df = pd.DataFrame() 477 teams_df_arr = [] 478 temp_df = pd.DataFrame() 479 480 now = datetime.now() 481 ncaa_divisions = ["I", "II", "III"] 482 if now.month > 5: 483 ncaa_seasons = [x for x in range(start_year, (now.year + 2))] 484 else: 485 ncaa_seasons = [x for x in range(start_year, (now.year + 1))] 486 487 logging.info( 488 "Loading in all NCAA basketball teams. " 489 + "If this is the first time you're seeing this message, " 490 + "it may take some time (3-10 minutes) for this to load." 491 ) 492 for s in ncaa_seasons: 493 logging.info(f"Loading in basketball teams for the {s} season.") 494 for d in ncaa_divisions: 495 try: 496 temp_df = get_basketball_teams(season=s, level=d) 497 teams_df_arr.append(temp_df) 498 del temp_df 499 except Exception as e: 500 logging.warning( 501 "Unhandled exception when trying to " + 502 f"get the teams. Full exception: `{e}`" 503 ) 504 505 506 teams_df = pd.concat(teams_df_arr, ignore_index=True) 507 teams_df = teams_df.infer_objects() 508 return teams_df
Compiles a list of known NCAA basketball teams in NCAA basketball history.
Parameters
start_year
(int, optional):
Optional argument.
Specifies the first season you want
NCAA basketball team information from.
get_wbb_data
(bool, optional):
Optional argument.
If you want women's basketball data instead of men's basketball data,
set this to True
.
Usage
from ncaa_stats_py.basketball import load_basketball_teams
# WARNING: Running this script "as-is" for the first time may
# take some time.
# The *N*th time you run this script will be faster.
# Load in every women's basketball team
# from 2011 to present day.
print(
"Load in every women's basketball team " +
"from 2011 to present day."
)
df = load_basketball_teams(get_wbb_data=True)
print(df)
# Load in every men's basketball team
# from 2011 to present day.
print(
"Load in every men's basketball team " +
"from 2011 to present day."
)
df = load_basketball_teams()
print(df)
# Load in every men's basketball team
# from 2020 to present day.
print(
"Load in every men's basketball team " +
"from 2020 to present day."
)
df = load_basketball_teams(start_year=2020)
print(df)
Returns
A pandas DataFrame
object with a list of
all known college basketball teams.
511def get_basketball_team_schedule(team_id: int) -> pd.DataFrame: 512 """ 513 Retrieves a team schedule, from a valid NCAA basketball team ID. 514 515 Parameters 516 ---------- 517 `team_id` (int, mandatory): 518 Required argument. 519 Specifies the team you want a schedule from. 520 This is separate from a school ID, which identifies the institution. 521 A team ID should be unique to a school, and a season. 522 523 Usage 524 ---------- 525 ```python 526 527 from ncaa_stats_py.basketball import get_basketball_team_schedule 528 529 ######################################## 530 # Men's Basketball # 531 ######################################## 532 533 # Get the team schedule for the 534 # 2024 Wright St. MBB team (D1, ID: 561255). 535 print( 536 "Get the team schedule for the " + 537 "2024 Wright St. MBB team (D1, ID: 561255)." 538 ) 539 df = get_basketball_team_schedule(561255) 540 print(df) 541 542 # Get the team schedule for the 543 # 2023 Caldwell MBB team (D2, ID: 542813). 544 print( 545 "Get the team schedule for the " + 546 "2023 Caldwell MBB team (D2, ID: 542813)." 547 ) 548 df = get_basketball_team_schedule(542813) 549 print(df) 550 551 # Get the team schedule for the 552 # 2022 SUNY Maritime MBB team (D3, ID: 528097). 553 print( 554 "Get the team schedule for the " + 555 "2022 SUNY Maritime MBB team (D3, ID: 528097)." 556 ) 557 df = get_basketball_team_schedule(528097) 558 print(df) 559 560 ######################################## 561 # Women's Basketball # 562 ######################################## 563 564 # Get the team schedule for the 565 # 2021 Wake Forest WBB team (D1, ID: 506339). 566 print( 567 "Get the team schedule for the " + 568 "2021 Wake Forest WBB team (D1, ID: 506339)." 569 ) 570 df = get_basketball_team_schedule(506339) 571 print(df) 572 573 # Get the team schedule for the 574 # 2020 Trevecca Nazarene WBB team (D2, ID: 484527). 575 print( 576 "Get the team schedule for the " + 577 "2020 Trevecca Nazarene WBB team (D2, ID: 484527)." 578 ) 579 df = get_basketball_team_schedule(484527) 580 print(df) 581 582 # Get the team schedule for the 583 # 2019 Simpson WBB team (D3, ID: 452452). 584 print( 585 "Get the team schedule for the " + 586 "2019 Simpson WBB team (D3, ID: 452452)." 587 ) 588 df = get_basketball_team_schedule(452452) 589 print(df) 590 591 ``` 592 593 Returns 594 ---------- 595 A pandas `DataFrame` object with an NCAA basketball team's schedule. 596 597 """ 598 599 sport_id = "" 600 schools_df = _get_schools() 601 games_df = pd.DataFrame() 602 games_df_arr = [] 603 season = 0 604 temp_df = pd.DataFrame() 605 load_from_cache = True 606 607 home_dir = expanduser("~") 608 home_dir = _format_folder_str(home_dir) 609 610 url = f"https://stats.ncaa.org/teams/{team_id}" 611 612 try: 613 team_df = load_basketball_teams() 614 team_df = team_df[team_df["team_id"] == team_id] 615 season = team_df["season"].iloc[0] 616 ncaa_division = team_df["ncaa_division"].iloc[0] 617 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 618 sport_id = "MBB" 619 except Exception: 620 team_df = load_basketball_teams(get_wbb_data=True) 621 team_df = team_df[team_df["team_id"] == team_id] 622 season = team_df["season"].iloc[0] 623 ncaa_division = team_df["ncaa_division"].iloc[0] 624 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 625 sport_id = "WBB" 626 # team_conference_name = team_df["team_conference_name"].iloc[0] 627 # school_name = team_df["school_name"].iloc[0] 628 # school_id = int(team_df["school_id"].iloc[0]) 629 630 del team_df 631 632 if exists(f"{home_dir}/.ncaa_stats_py/"): 633 pass 634 else: 635 mkdir(f"{home_dir}/.ncaa_stats_py/") 636 637 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 638 pass 639 else: 640 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 641 642 if exists( 643 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 644 ): 645 pass 646 else: 647 mkdir( 648 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 649 ) 650 651 if exists( 652 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 653 + f"{team_id}_team_schedule.csv" 654 ): 655 games_df = pd.read_csv( 656 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/team_schedule/" 657 + f"{team_id}_team_schedule.csv" 658 ) 659 file_mod_datetime = datetime.fromtimestamp( 660 getmtime( 661 f"{home_dir}/.ncaa_stats_py/" 662 + f"basketball_{sport_id}/team_schedule/" 663 + f"{team_id}_team_schedule.csv" 664 ) 665 ) 666 else: 667 file_mod_datetime = datetime.today() 668 load_from_cache = False 669 670 now = datetime.today() 671 672 age = now - file_mod_datetime 673 if ( 674 age.days > 1 and 675 season >= now.year 676 ): 677 load_from_cache = False 678 679 if load_from_cache is True: 680 return games_df 681 682 response = _get_webpage(url=url) 683 soup = BeautifulSoup(response.text, features="lxml") 684 685 school_name = soup.find("div", {"class": "card"}).find("img").get("alt") 686 season_name = ( 687 soup.find("select", {"id": "year_list"}) 688 .find("option", {"selected": "selected"}) 689 .text 690 ) 691 # For NCAA basketball, the season always starts in the fall semester, 692 # and ends in the spring semester. 693 # Thus, if `season_name` = "2011-12", this is the "2012" basketball season, 694 # because 2012 would encompass the fall and spring semesters 695 # for NCAA member institutions. 696 # season = f"{season_name[0:2]}{season_name[-2:]}" 697 # season = int(season) 698 soup = soup.find_all( 699 "div", 700 {"class": "col p-0"}, 701 ) 702 703 # declaring it here to prevent potential problems down the road. 704 table_data = "" 705 for s in soup: 706 try: 707 temp_name = s.find("div", {"class": "card-header"}) 708 temp_name = temp_name.text 709 except Exception as e: 710 logging.warning( 711 f"Could not parse card header. Full exception `{e}`. " 712 + "Attempting alternate method." 713 ) 714 temp_name = s.find("tr", {"class": "heading"}).find("td").text 715 716 if "schedule" in temp_name.lower(): 717 table_data = s.find("table") 718 719 t_rows = table_data.find_all("tr", {"class": "underline_rows"}) 720 721 if len(t_rows) == 0: 722 t_rows = table_data.find_all("tr") 723 724 for g in t_rows: 725 is_valid_row = True 726 game_num = 1 727 ot_periods = 0 728 is_home_game = True 729 is_neutral_game = False 730 731 cells = g.find_all("td") 732 if len(cells) <= 1: 733 # Because of how *well* designed 734 # stats.ncaa.org is, if we have to use execute 735 # the `if len(t_rows) == 0:` code, 736 # we need to catch any cases where every element in a 737 # table row (`<tr>`) is a table header (`<th>`), 738 # instead of a table data cell (`<td>`) 739 continue 740 741 game_date = cells[0].text 742 743 # If "(" is in the same cell as the date, 744 # this means that this game is an extra innings game. 745 # The number encased in `()` is the actual number of innings. 746 # We need to remove that from the date, 747 # and move it into a separate variable. 748 if "(" in game_date: 749 game_date = game_date.replace(")", "") 750 game_date, game_num = game_date.split("(") 751 game_date = game_date.strip() 752 game_num = int(game_num.strip()) 753 754 game_date = datetime.strptime(game_date, "%m/%d/%Y").date() 755 756 try: 757 opp_team_id = cells[1].find("a").get("href") 758 except IndexError: 759 logging.info( 760 "Skipping row because it is clearly " 761 + "not a row that has schedule data." 762 ) 763 is_valid_row = False 764 except AttributeError as e: 765 logging.info( 766 "Could not extract a team ID for this game. " + 767 f"Full exception {e}" 768 ) 769 opp_team_id = "-1" 770 except Exception as e: 771 logging.warning( 772 "An unhandled exception has occurred when " 773 + "trying to get the opposition team ID for this game. " 774 f"Full exception `{e}`." 775 ) 776 raise e 777 if is_valid_row is True: 778 if opp_team_id is not None: 779 opp_team_id = opp_team_id.replace("/teams/", "") 780 opp_team_id = int(opp_team_id) 781 782 try: 783 opp_team_name = cells[1].find("img").get("alt") 784 except AttributeError: 785 logging.info( 786 "Couldn't find the opposition team name " 787 + "for this row from an image element. " 788 + "Attempting a backup method" 789 ) 790 opp_team_name = cells[1].text 791 except Exception as e: 792 logging.info( 793 "Unhandled exception when trying to get the " 794 + "opposition team name from this game. " 795 + f"Full exception `{e}`" 796 ) 797 raise e 798 else: 799 opp_team_name = cells[1].text 800 801 if opp_team_name[0] == "@": 802 # The logic for determining if this game was a 803 # neutral site game doesn't care if that info is in 804 # `opp_team_name`. 805 opp_team_name = opp_team_name.strip().replace("@", "") 806 elif "@" in opp_team_name: 807 opp_team_name = opp_team_name.strip().split("@")[0] 808 # opp_team_show_name = cells[1].text.strip() 809 810 opp_text = cells[1].text 811 opp_text = opp_text.strip() 812 if "@" in opp_text and opp_text[0] == "@": 813 is_home_game = False 814 elif "@" in opp_text and opp_text[0] != "@": 815 is_neutral_game = True 816 is_home_game = False 817 # This is just to cover conference and NCAA championship 818 # tournaments. 819 elif "championship" in opp_text.lower(): 820 is_neutral_game = True 821 is_home_game = False 822 elif "ncaa" in opp_text.lower(): 823 is_neutral_game = True 824 is_home_game = False 825 826 del opp_text 827 828 score = cells[2].text.strip() 829 if len(score) == 0: 830 score_1 = 0 831 score_2 = 0 832 elif ( 833 "canceled" not in score.lower() and 834 "ppd" not in score.lower() 835 ): 836 score_1, score_2 = score.split("-") 837 838 # `score_1` should be "W `n`", "L `n`", or "T `n`", 839 # with `n` representing the number of runs this team 840 # scored in this game. 841 # Let's remove the "W", "L", or "T" from `score_1`, 842 # and determine which team won later on in this code. 843 if any(x in score_1 for x in ["W", "L", "T"]): 844 score_1 = score_1.split(" ")[1] 845 846 if "(" in score_2: 847 score_2 = score_2.replace(")", "") 848 score_2, ot_periods = score_2.split("(") 849 ot_periods = ot_periods.replace("OT", "") 850 ot_periods = ot_periods.replace(" ", "") 851 ot_periods = int(ot_periods) 852 853 if ot_periods is None: 854 ot_periods = 0 855 score_1 = int(score_1) 856 score_2 = int(score_2) 857 else: 858 score_1 = None 859 score_2 = None 860 861 try: 862 game_id = cells[2].find("a").get("href") 863 game_id = game_id.replace("/contests", "") 864 game_id = game_id.replace("/box_score", "") 865 game_id = game_id.replace("/", "") 866 game_id = int(game_id) 867 game_url = ( 868 f"https://stats.ncaa.org/contests/{game_id}/box_score" 869 ) 870 871 except AttributeError as e: 872 logging.info( 873 "Could not parse a game ID for this game. " 874 + f"Full exception `{e}`." 875 ) 876 game_id = None 877 game_url = None 878 except Exception as e: 879 logging.info( 880 "An unhandled exception occurred when trying " 881 + "to find a game ID for this game. " 882 + f"Full exception `{e}`." 883 ) 884 raise e 885 try: 886 attendance = cells[3].text 887 attendance = attendance.replace(",", "") 888 attendance = attendance.replace("\n", "") 889 attendance = int(attendance) 890 except IndexError as e: 891 logging.info( 892 "It doesn't appear as if there is an attendance column " 893 + "for this team's schedule table." 894 f"Full exception `{e}`." 895 ) 896 attendance = None 897 except ValueError as e: 898 logging.info( 899 "There doesn't appear as if " 900 + "there is a recorded attendance. " 901 + "for this game/row. " 902 f"Full exception `{e}`." 903 ) 904 attendance = None 905 906 except Exception as e: 907 logging.info( 908 "An unhandled exception occurred when trying " 909 + "to find this game's attendance. " 910 + f"Full exception `{e}`." 911 ) 912 raise e 913 914 if is_home_game is True: 915 temp_df = pd.DataFrame( 916 { 917 "season": season, 918 "season_name": season_name, 919 "game_id": game_id, 920 "game_date": game_date, 921 "game_num": game_num, 922 "ot_periods": ot_periods, 923 "home_team_id": team_id, 924 "home_team_name": school_name, 925 "away_team_id": opp_team_id, 926 "away_team_name": opp_team_name, 927 "home_team_score": score_1, 928 "away_team_score": score_2, 929 "is_neutral_game": is_neutral_game, 930 "game_url": game_url, 931 }, 932 index=[0], 933 ) 934 games_df_arr.append(temp_df) 935 del temp_df 936 elif is_neutral_game is True: 937 # For the sake of simplicity, 938 # order both team ID's, 939 # and set the lower number of the two as 940 # the "away" team in this neutral site game, 941 # just so there's no confusion if someone 942 # combines a ton of these team schedule `DataFrame`s, 943 # and wants to remove duplicates afterwards. 944 t_ids = [opp_team_id, team_id] 945 t_ids.sort() 946 947 if t_ids[0] == team_id: 948 # home 949 temp_df = pd.DataFrame( 950 { 951 "season": season, 952 "season_name": season_name, 953 "game_id": game_id, 954 "game_date": game_date, 955 "game_num": game_num, 956 "ot_periods": ot_periods, 957 "home_team_id": team_id, 958 "home_team_name": school_name, 959 "away_team_id": opp_team_id, 960 "away_team_name": opp_team_name, 961 "home_team_score": score_1, 962 "away_team_score": score_2, 963 "is_neutral_game": is_neutral_game, 964 "game_url": game_url, 965 }, 966 index=[0], 967 ) 968 969 else: 970 # away 971 temp_df = pd.DataFrame( 972 { 973 "season": season, 974 "season_name": season_name, 975 "game_id": game_id, 976 "game_date": game_date, 977 "game_num": game_num, 978 "ot_periods": ot_periods, 979 "home_team_id": opp_team_id, 980 "home_team_name": opp_team_name, 981 "away_team_id": team_id, 982 "away_team_name": school_name, 983 "home_team_score": score_2, 984 "away_team_score": score_1, 985 "is_neutral_game": is_neutral_game, 986 "game_url": game_url, 987 }, 988 index=[0], 989 ) 990 991 games_df_arr.append(temp_df) 992 del temp_df 993 else: 994 temp_df = pd.DataFrame( 995 { 996 "season": season, 997 "season_name": season_name, 998 "game_id": game_id, 999 "game_date": game_date, 1000 "game_num": game_num, 1001 "ot_periods": ot_periods, 1002 "home_team_id": opp_team_id, 1003 "home_team_name": opp_team_name, 1004 "away_team_id": team_id, 1005 "away_team_name": school_name, 1006 "home_team_score": score_2, 1007 "away_team_score": score_1, 1008 "is_neutral_game": is_neutral_game, 1009 "game_url": game_url, 1010 }, 1011 index=[0], 1012 ) 1013 1014 games_df_arr.append(temp_df) 1015 del temp_df 1016 1017 # team_photo = team_id.find("img").get("src") 1018 1019 games_df = pd.concat(games_df_arr, ignore_index=True) 1020 1021 temp_df = schools_df.rename( 1022 columns={ 1023 "school_name": "home_team_name", 1024 "school_id": "home_school_id" 1025 } 1026 ) 1027 games_df = games_df.merge(right=temp_df, on="home_team_name", how="left") 1028 1029 temp_df = schools_df.rename( 1030 columns={ 1031 "school_name": "away_team_name", 1032 "school_id": "away_school_id" 1033 } 1034 ) 1035 games_df = games_df.merge(right=temp_df, on="away_team_name", how="left") 1036 games_df["ncaa_division"] = ncaa_division 1037 games_df["ncaa_division_formatted"] = ncaa_division_formatted 1038 1039 # games_df["game_url"] = games_df["game_url"].str.replace("/box_score", "") 1040 games_df.to_csv( 1041 f"{home_dir}/.ncaa_stats_py/" 1042 + f"basketball_{sport_id}/team_schedule/" 1043 + f"{team_id}_team_schedule.csv", 1044 index=False, 1045 ) 1046 1047 return games_df
Retrieves a team schedule, from a valid NCAA basketball team ID.
Parameters
team_id
(int, mandatory):
Required argument.
Specifies the team you want a schedule from.
This is separate from a school ID, which identifies the institution.
A team ID should be unique to a school, and a season.
Usage
from ncaa_stats_py.basketball import get_basketball_team_schedule
########################################
# Men's Basketball #
########################################
# Get the team schedule for the
# 2024 Wright St. MBB team (D1, ID: 561255).
print(
"Get the team schedule for the " +
"2024 Wright St. MBB team (D1, ID: 561255)."
)
df = get_basketball_team_schedule(561255)
print(df)
# Get the team schedule for the
# 2023 Caldwell MBB team (D2, ID: 542813).
print(
"Get the team schedule for the " +
"2023 Caldwell MBB team (D2, ID: 542813)."
)
df = get_basketball_team_schedule(542813)
print(df)
# Get the team schedule for the
# 2022 SUNY Maritime MBB team (D3, ID: 528097).
print(
"Get the team schedule for the " +
"2022 SUNY Maritime MBB team (D3, ID: 528097)."
)
df = get_basketball_team_schedule(528097)
print(df)
########################################
# Women's Basketball #
########################################
# Get the team schedule for the
# 2021 Wake Forest WBB team (D1, ID: 506339).
print(
"Get the team schedule for the " +
"2021 Wake Forest WBB team (D1, ID: 506339)."
)
df = get_basketball_team_schedule(506339)
print(df)
# Get the team schedule for the
# 2020 Trevecca Nazarene WBB team (D2, ID: 484527).
print(
"Get the team schedule for the " +
"2020 Trevecca Nazarene WBB team (D2, ID: 484527)."
)
df = get_basketball_team_schedule(484527)
print(df)
# Get the team schedule for the
# 2019 Simpson WBB team (D3, ID: 452452).
print(
"Get the team schedule for the " +
"2019 Simpson WBB team (D3, ID: 452452)."
)
df = get_basketball_team_schedule(452452)
print(df)
Returns
A pandas DataFrame
object with an NCAA basketball team's schedule.
1050def get_basketball_day_schedule( 1051 game_date: str | date | datetime, 1052 level: str | int = "I", 1053 get_wbb_data: bool = False 1054): 1055 """ 1056 Given a date and NCAA level, this function retrieves basketball every game 1057 for that date. 1058 1059 Parameters 1060 ---------- 1061 `game_date` (int, mandatory): 1062 Required argument. 1063 Specifies the date you want a basketball schedule from. 1064 For best results, pass a string formatted as "YYYY-MM-DD". 1065 1066 `level` (int, mandatory): 1067 Required argument. 1068 Specifies the level/division you want a 1069 NCAA basketball schedule from. 1070 This can either be an integer (1-3) or a string ("I"-"III"). 1071 1072 `get_wbb_data` (bool, optional): 1073 Optional argument. 1074 If you want women's basketball data instead of men's basketball data, 1075 set this to `True`. 1076 1077 Usage 1078 ---------- 1079 ```python 1080 1081 from ncaa_stats_py.basketball import get_basketball_day_schedule 1082 1083 1084 # Get all DI games that will be played on April 22th, 2025. 1085 print("Get all games that will be played on April 22th, 2025.") 1086 df = get_basketball_day_schedule("2025-04-22", level=1) 1087 print(df) 1088 1089 # Get all division II games that were played on February 14th, 2025. 1090 print("Get all division II games that were played on February 14th, 2025.") 1091 df = get_basketball_day_schedule("2025-02-14", level="I") 1092 print(df) 1093 1094 # Get all DI games that were played on December 10th, 2024. 1095 print("Get all games that were played on December 10th, 2024.") 1096 df = get_basketball_day_schedule("2024-12-10", level="I") 1097 print(df) 1098 1099 # Get all DI games (if any) that were played on December 12th, 2024. 1100 print("Get all DI games (if any) that were played on December 12th, 2024.") 1101 df = get_basketball_day_schedule("2024-12-12") 1102 print(df) 1103 1104 # Get all DII games played on January 14th, 2024. 1105 print("Get all DI games played on January 14th, 2024.") 1106 df = get_basketball_day_schedule("2024-01-14") 1107 print(df) 1108 1109 # Get all division III games played on December 16th, 2023. 1110 print("Get all division III games played on December 16th, 2023.") 1111 df = get_basketball_day_schedule("2023-12-16") 1112 print(df) 1113 1114 ``` 1115 1116 Returns 1117 ---------- 1118 A pandas `DataFrame` object with all basketball games played on that day, 1119 for that NCAA division/level. 1120 1121 """ 1122 1123 season = 0 1124 sport_id = "MBB" 1125 1126 schedule_df = pd.DataFrame() 1127 schedule_df_arr = [] 1128 1129 if isinstance(game_date, date): 1130 game_datetime = datetime.combine( 1131 game_date, datetime.min.time() 1132 ) 1133 elif isinstance(game_date, datetime): 1134 game_datetime = game_date 1135 elif isinstance(game_date, str): 1136 game_datetime = parser.parse( 1137 game_date 1138 ) 1139 else: 1140 unhandled_datatype = type(game_date) 1141 raise ValueError( 1142 f"Unhandled datatype for `game_date`: `{unhandled_datatype}`" 1143 ) 1144 1145 if isinstance(level, int) and level == 1: 1146 formatted_level = "I" 1147 ncaa_level = 1 1148 elif isinstance(level, int) and level == 2: 1149 formatted_level = "II" 1150 ncaa_level = 2 1151 elif isinstance(level, int) and level == 3: 1152 formatted_level = "III" 1153 ncaa_level = 3 1154 elif isinstance(level, str) and ( 1155 level.lower() == "i" or level.lower() == "d1" or level.lower() == "1" 1156 ): 1157 ncaa_level = 1 1158 formatted_level = level.upper() 1159 elif isinstance(level, str) and ( 1160 level.lower() == "ii" or level.lower() == "d2" or level.lower() == "2" 1161 ): 1162 ncaa_level = 2 1163 formatted_level = level.upper() 1164 elif isinstance(level, str) and ( 1165 level.lower() == "iii" or level.lower() == "d3" or level.lower() == "3" 1166 ): 1167 ncaa_level = 3 1168 formatted_level = level.upper() 1169 1170 del level 1171 1172 if get_wbb_data is True: 1173 sport_id = "WBB" 1174 elif get_wbb_data is False: 1175 sport_id = "MBB" 1176 else: 1177 raise ValueError( 1178 f"Unhandled value for `get_wbb_data`: `{get_wbb_data}`" 1179 ) 1180 1181 season = game_datetime.year 1182 game_month = game_datetime.month 1183 game_day = game_datetime.day 1184 game_year = game_datetime.year 1185 1186 if game_month > 7: 1187 season += 1 1188 url = ( 1189 "https://stats.ncaa.org/contests/" + 1190 f"livestream_scoreboards?utf8=%E2%9C%93&sport_code={sport_id}" + 1191 f"&academic_year={season}&division={ncaa_level}" + 1192 f"&game_date={game_month:00d}%2F{game_day:00d}%2F{game_year}" + 1193 "&commit=Submit" 1194 ) 1195 else: 1196 url = ( 1197 "https://stats.ncaa.org/contests/" + 1198 f"livestream_scoreboards?utf8=%E2%9C%93&sport_code={sport_id}" + 1199 f"&academic_year={season}&division={ncaa_level}" + 1200 f"&game_date={game_month:00d}%2F{game_day:00d}%2F{game_year}" + 1201 "&commit=Submit" 1202 ) 1203 1204 response = _get_webpage(url=url) 1205 soup = BeautifulSoup(response.text, features="lxml") 1206 1207 game_boxes = soup.find_all("div", {"class": "table-responsive"}) 1208 1209 for box in game_boxes: 1210 game_id = None 1211 game_alt_text = None 1212 game_num = 1 1213 # t_box = box.find("table") 1214 table_box = box.find("table") 1215 table_rows = table_box.find_all("tr") 1216 1217 # Date/attendance 1218 game_date_str = table_rows[0].find("div", {"class": "col-6 p-0"}).text 1219 game_date_str = game_date_str.replace("\n", "") 1220 game_date_str = game_date_str.strip() 1221 game_date_str = game_date_str.replace("TBA ", "TBA") 1222 game_date_str = game_date_str.replace("TBD ", "TBD") 1223 game_date_str = game_date_str.replace("PM ", "PM") 1224 game_date_str = game_date_str.replace("AM ", "AM") 1225 game_date_str = game_date_str.strip() 1226 attendance_str = table_rows[0].find( 1227 "div", 1228 {"class": "col p-0 text-right"} 1229 ).text 1230 1231 attendance_str = attendance_str.replace("Attend:", "") 1232 attendance_str = attendance_str.replace(",", "") 1233 attendance_str = attendance_str.replace("\n", "") 1234 if ( 1235 "st" in attendance_str.lower() or 1236 "nd" in attendance_str.lower() or 1237 "rd" in attendance_str.lower() or 1238 "th" in attendance_str.lower() 1239 ): 1240 # This is not an attendance, 1241 # this is whatever quarter/half/inning this game is in. 1242 attendance_num = None 1243 elif "final" in attendance_str.lower(): 1244 attendance_num = None 1245 elif len(attendance_str) > 0: 1246 attendance_num = int(attendance_str) 1247 else: 1248 attendance_num = None 1249 1250 if "(" in game_date_str: 1251 game_date_str = game_date_str.replace(")", "") 1252 game_date_str, game_num = game_date_str.split("(") 1253 game_num = int(game_num) 1254 1255 if "TBA" in game_date_str: 1256 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBA') 1257 elif "tba" in game_date_str: 1258 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tba') 1259 elif "TBD" in game_date_str: 1260 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBD') 1261 elif "tbd" in game_date_str: 1262 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tbd') 1263 elif ( 1264 "tbd" not in game_date_str.lower() and 1265 ":" not in game_date_str.lower() 1266 ): 1267 game_date_str = game_date_str.replace(" ", "") 1268 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y') 1269 else: 1270 game_datetime = datetime.strptime( 1271 game_date_str, 1272 '%m/%d/%Y %I:%M %p' 1273 ) 1274 game_datetime = game_datetime.astimezone(timezone("US/Eastern")) 1275 1276 game_alt_text = table_rows[1].find_all("td")[0].text 1277 if game_alt_text is not None and len(game_alt_text) > 0: 1278 game_alt_text = game_alt_text.replace("\n", "") 1279 game_alt_text = game_alt_text.strip() 1280 1281 if len(game_alt_text) == 0: 1282 game_alt_text = None 1283 1284 urls_arr = box.find_all("a") 1285 1286 for u in urls_arr: 1287 url_temp = u.get("href") 1288 if "contests" in url_temp: 1289 game_id = url_temp 1290 del url_temp 1291 1292 if game_id is None: 1293 for r in range(0, len(table_rows)): 1294 temp = table_rows[r] 1295 temp_id = temp.get("id") 1296 1297 if temp_id is not None and len(temp_id) > 0: 1298 game_id = temp_id 1299 1300 del urls_arr 1301 1302 game_id = game_id.replace("/contests", "") 1303 game_id = game_id.replace("/box_score", "") 1304 game_id = game_id.replace("/livestream_scoreboards", "") 1305 game_id = game_id.replace("/", "") 1306 game_id = game_id.replace("contest_", "") 1307 game_id = int(game_id) 1308 1309 table_rows = table_box.find_all("tr", {"id": f"contest_{game_id}"}) 1310 away_team_row = table_rows[0] 1311 home_team_row = table_rows[1] 1312 1313 # Away team 1314 td_arr = away_team_row.find_all("td") 1315 1316 try: 1317 away_team_name = td_arr[0].find("img").get("alt") 1318 except Exception: 1319 away_team_name = td_arr[1].text 1320 away_team_name = away_team_name.replace("\n", "") 1321 away_team_name = away_team_name.strip() 1322 1323 try: 1324 away_team_id = td_arr[1].find("a").get("href") 1325 away_team_id = away_team_id.replace("/teams/", "") 1326 away_team_id = int(away_team_id) 1327 except AttributeError: 1328 away_team_id = None 1329 logging.info("No team ID found for the away team") 1330 except Exception as e: 1331 raise e 1332 1333 away_points_scored = td_arr[-1].text 1334 away_points_scored = away_points_scored.replace("\n", "") 1335 away_points_scored = away_points_scored.replace("\xa0", "") 1336 if len(away_points_scored) > 0: 1337 away_points_scored = int(away_points_scored) 1338 else: 1339 away_points_scored = 0 1340 1341 del td_arr 1342 1343 # Home team 1344 td_arr = home_team_row.find_all("td") 1345 1346 try: 1347 home_team_name = td_arr[0].find("img").get("alt") 1348 except Exception: 1349 home_team_name = td_arr[1].text 1350 home_team_name = home_team_name.replace("\n", "") 1351 home_team_name = home_team_name.strip() 1352 1353 try: 1354 home_team_id = td_arr[1].find("a").get("href") 1355 home_team_id = home_team_id.replace("/teams/", "") 1356 home_team_id = int(home_team_id) 1357 except AttributeError: 1358 home_team_id = None 1359 logging.info("No team ID found for the home team") 1360 except Exception as e: 1361 raise e 1362 1363 home_points_scored = td_arr[-1].text 1364 home_points_scored = home_points_scored.replace("\n", "") 1365 home_points_scored = home_points_scored.replace("\xa0", "") 1366 if len(home_points_scored) > 0: 1367 home_points_scored = int(home_points_scored) 1368 else: 1369 home_points_scored = 0 1370 1371 temp_df = pd.DataFrame( 1372 { 1373 "season": season, 1374 "sport_id": sport_id, 1375 "game_date": game_datetime.strftime("%Y-%m-%d"), 1376 "game_datetime": game_datetime.isoformat(), 1377 "game_id": game_id, 1378 "formatted_level": formatted_level, 1379 "ncaa_level": ncaa_level, 1380 "game_alt_text": game_alt_text, 1381 "away_team_id": away_team_id, 1382 "away_team_name": away_team_name, 1383 "home_team_id": home_team_id, 1384 "home_team_name": home_team_name, 1385 "home_points_scored": home_points_scored, 1386 "away_points_scored": away_points_scored, 1387 "attendance": attendance_num 1388 }, 1389 index=[0] 1390 ) 1391 schedule_df_arr.append(temp_df) 1392 1393 del temp_df 1394 1395 if len(schedule_df_arr) >= 1: 1396 schedule_df = pd.concat(schedule_df_arr, ignore_index=True) 1397 else: 1398 logging.warning( 1399 "Could not find any game(s) for " 1400 + f"{game_datetime.year:00d}-{game_datetime.month:00d}" 1401 + f"-{game_datetime.day:00d}. " 1402 + "If you believe this is an error, " 1403 + "please raise an issue at " 1404 + "\n https://github.com/armstjc/ncaa_stats_py/issues \n" 1405 ) 1406 return schedule_df
Given a date and NCAA level, this function retrieves basketball every game for that date.
Parameters
game_date
(int, mandatory):
Required argument.
Specifies the date you want a basketball schedule from.
For best results, pass a string formatted as "YYYY-MM-DD".
level
(int, mandatory):
Required argument.
Specifies the level/division you want a
NCAA basketball schedule from.
This can either be an integer (1-3) or a string ("I"-"III").
get_wbb_data
(bool, optional):
Optional argument.
If you want women's basketball data instead of men's basketball data,
set this to True
.
Usage
from ncaa_stats_py.basketball import get_basketball_day_schedule
# Get all DI games that will be played on April 22th, 2025.
print("Get all games that will be played on April 22th, 2025.")
df = get_basketball_day_schedule("2025-04-22", level=1)
print(df)
# Get all division II games that were played on February 14th, 2025.
print("Get all division II games that were played on February 14th, 2025.")
df = get_basketball_day_schedule("2025-02-14", level="I")
print(df)
# Get all DI games that were played on December 10th, 2024.
print("Get all games that were played on December 10th, 2024.")
df = get_basketball_day_schedule("2024-12-10", level="I")
print(df)
# Get all DI games (if any) that were played on December 12th, 2024.
print("Get all DI games (if any) that were played on December 12th, 2024.")
df = get_basketball_day_schedule("2024-12-12")
print(df)
# Get all DII games played on January 14th, 2024.
print("Get all DI games played on January 14th, 2024.")
df = get_basketball_day_schedule("2024-01-14")
print(df)
# Get all division III games played on December 16th, 2023.
print("Get all division III games played on December 16th, 2023.")
df = get_basketball_day_schedule("2023-12-16")
print(df)
Returns
A pandas DataFrame
object with all basketball games played on that day,
for that NCAA division/level.
1409def get_full_basketball_schedule( 1410 season: int, 1411 level: str | int = "I", 1412 get_wbb_data: bool = False 1413) -> pd.DataFrame: 1414 """ 1415 Retrieves a full basketball schedule, 1416 from an NCAA level (`"I"`, `"II"`, `"III"`). 1417 The way this is done is by going through every team in a division, 1418 and parsing the schedules of every team in a division. 1419 1420 This function will take time when first run (30-60 minutes)! 1421 You have been warned. 1422 1423 Parameters 1424 ---------- 1425 `season` (int, mandatory): 1426 Specifies the season you want a schedule from. 1427 1428 `level` (int | str, mandatory): 1429 Specifies the team you want a schedule from. 1430 1431 `get_wbb_data` (bool, optional): 1432 Optional argument. 1433 If you want women's basketball data instead of men's basketball data, 1434 set this to `True`. 1435 1436 Usage 1437 ---------- 1438 ```python 1439 1440 from ncaa_stats_py.basketball import get_full_basketball_schedule 1441 1442 # Get the entire 2024 schedule for the 2024 D1 basketball season. 1443 print("Get the entire 2024 schedule for the 2024 D1 basketball season.") 1444 df = get_full_basketball_schedule(season=2024, level="I") 1445 print(df) 1446 1447 # You can also input `level` as an integer. 1448 # In addition, this and other functions cache data, 1449 # so this should load very quickly 1450 # compared to the first run of this function. 1451 print("You can also input `level` as an integer.") 1452 print( 1453 "In addition, this and other functions cache data, " 1454 + "so this should load very quickly " 1455 + "compared to the first run of this function." 1456 ) 1457 df = get_full_basketball_schedule(season=2024, level=1) 1458 print(df) 1459 1460 ``` 1461 1462 Returns 1463 ---------- 1464 A pandas `DataFrame` object with an NCAA basketball 1465 schedule for a specific season and level. 1466 """ 1467 1468 sport_id = "" 1469 load_from_cache = True 1470 home_dir = expanduser("~") 1471 home_dir = _format_folder_str(home_dir) 1472 schedule_df = pd.DataFrame() 1473 schedule_df_arr = [] 1474 temp_df = pd.DataFrame() 1475 formatted_level = "" 1476 ncaa_level = 0 1477 1478 if get_wbb_data is True: 1479 sport_id = "WBB" 1480 else: 1481 sport_id = "MBB" 1482 1483 if isinstance(level, int) and level == 1: 1484 formatted_level = "I" 1485 ncaa_level = 1 1486 elif isinstance(level, int) and level == 2: 1487 formatted_level = "II" 1488 ncaa_level = 2 1489 elif isinstance(level, int) and level == 3: 1490 formatted_level = "III" 1491 ncaa_level = 3 1492 elif isinstance(level, str) and ( 1493 level.lower() == "i" or level.lower() == "d1" or level.lower() == "1" 1494 ): 1495 ncaa_level = 1 1496 formatted_level = level.upper() 1497 elif isinstance(level, str) and ( 1498 level.lower() == "ii" or level.lower() == "d2" or level.lower() == "2" 1499 ): 1500 ncaa_level = 2 1501 formatted_level = level.upper() 1502 elif isinstance(level, str) and ( 1503 level.lower() == "iii" or level.lower() == "d3" or level.lower() == "3" 1504 ): 1505 ncaa_level = 3 1506 formatted_level = level.upper() 1507 1508 del level 1509 1510 if exists(f"{home_dir}/.ncaa_stats_py/"): 1511 pass 1512 else: 1513 mkdir(f"{home_dir}/.ncaa_stats_py/") 1514 1515 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 1516 pass 1517 else: 1518 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 1519 1520 if exists( 1521 f"{home_dir}/.ncaa_stats_py/" + 1522 f"basketball_{sport_id}/full_schedule/" 1523 ): 1524 pass 1525 else: 1526 mkdir( 1527 f"{home_dir}/.ncaa_stats_py/" + 1528 f"basketball_{sport_id}/full_schedule/" 1529 ) 1530 1531 if exists( 1532 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/full_schedule/" 1533 + f"{season}_{formatted_level}_full_schedule.csv" 1534 ): 1535 teams_df = pd.read_csv( 1536 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/full_schedule/" 1537 + f"{season}_{formatted_level}_full_schedule.csv" 1538 ) 1539 file_mod_datetime = datetime.fromtimestamp( 1540 getmtime( 1541 f"{home_dir}/.ncaa_stats_py/" + 1542 f"basketball_{sport_id}/full_schedule/" 1543 + f"{season}_{formatted_level}_full_schedule.csv" 1544 ) 1545 ) 1546 else: 1547 file_mod_datetime = datetime.today() 1548 load_from_cache = False 1549 1550 now = datetime.today() 1551 1552 age = now - file_mod_datetime 1553 1554 if ( 1555 age.days > 1 and 1556 season >= now.year 1557 ): 1558 load_from_cache = False 1559 1560 if load_from_cache is True: 1561 return teams_df 1562 1563 teams_df = load_basketball_teams() 1564 teams_df = teams_df[ 1565 (teams_df["season"] == season) & 1566 (teams_df["ncaa_division"] == ncaa_level) 1567 ] 1568 team_ids_arr = teams_df["team_id"].to_numpy() 1569 1570 for team_id in tqdm(team_ids_arr): 1571 temp_df = get_basketball_team_schedule(team_id=team_id) 1572 schedule_df_arr.append(temp_df) 1573 1574 schedule_df = pd.concat(schedule_df_arr, ignore_index=True) 1575 schedule_df = schedule_df.drop_duplicates(subset="game_id", keep="first") 1576 schedule_df.to_csv( 1577 f"{home_dir}/.ncaa_stats_py/" 1578 + f"basketball_{sport_id}/full_schedule/" 1579 + f"{season}_{formatted_level}_full_schedule.csv", 1580 index=False, 1581 ) 1582 return schedule_df
Retrieves a full basketball schedule,
from an NCAA level ("I"
, "II"
, "III"
).
The way this is done is by going through every team in a division,
and parsing the schedules of every team in a division.
This function will take time when first run (30-60 minutes)! You have been warned.
Parameters
season
(int, mandatory):
Specifies the season you want a schedule from.
level
(int | str, mandatory):
Specifies the team you want a schedule from.
get_wbb_data
(bool, optional):
Optional argument.
If you want women's basketball data instead of men's basketball data,
set this to True
.
Usage
from ncaa_stats_py.basketball import get_full_basketball_schedule
# Get the entire 2024 schedule for the 2024 D1 basketball season.
print("Get the entire 2024 schedule for the 2024 D1 basketball season.")
df = get_full_basketball_schedule(season=2024, level="I")
print(df)
# You can also input `level` as an integer.
# In addition, this and other functions cache data,
# so this should load very quickly
# compared to the first run of this function.
print("You can also input `level` as an integer.")
print(
"In addition, this and other functions cache data, "
+ "so this should load very quickly "
+ "compared to the first run of this function."
)
df = get_full_basketball_schedule(season=2024, level=1)
print(df)
Returns
A pandas DataFrame
object with an NCAA basketball
schedule for a specific season and level.
1585def get_basketball_team_roster(team_id: int) -> pd.DataFrame: 1586 """ 1587 Retrieves a basketball team's roster from a given team ID. 1588 1589 Parameters 1590 ---------- 1591 `team_id` (int, mandatory): 1592 Required argument. 1593 Specifies the team you want a roster from. 1594 This is separate from a school ID, which identifies the institution. 1595 A team ID should be unique to a school, and a season. 1596 1597 Usage 1598 ---------- 1599 ```python 1600 1601 from ncaa_stats_py.basketball import get_basketball_team_roster 1602 1603 ######################################## 1604 # Men's Basketball # 1605 ######################################## 1606 1607 # Get the basketball roster for the 1608 # 2024 Alabama St. MBB team (D1, ID: 560655). 1609 print( 1610 "Get the basketball roster for the " + 1611 "2024 Alabama St. MBB team (D1, ID: 560655)." 1612 ) 1613 df = get_basketball_team_roster(560655) 1614 print(df) 1615 1616 # Get the basketball roster for the 1617 # 2023 Roberts Wesleyan MBB team (D2, ID: 542994). 1618 print( 1619 "Get the basketball roster for the " + 1620 "2023 Roberts Wesleyan MBB team (D2, ID: 542994)." 1621 ) 1622 df = get_basketball_team_roster(542994) 1623 print(df) 1624 1625 # Get the basketball roster for the 1626 # 2022 Pacific Lutheran MBB team (D3, ID: 528255). 1627 print( 1628 "Get the basketball roster for the " + 1629 "2022 Pacific Lutheran MBB team (D3, ID: 528255)." 1630 ) 1631 df = get_basketball_team_roster(528255) 1632 print(df) 1633 1634 ######################################## 1635 # Women's Basketball # 1636 ######################################## 1637 1638 # Get the basketball roster for the 1639 # 2021 Michigan St. WBB team (D1, ID: 506069). 1640 print( 1641 "Get the basketball roster for the " + 1642 "2021 Michigan St. WBB team (D1, ID: 506069)." 1643 ) 1644 df = get_basketball_team_roster(506069) 1645 print(df) 1646 1647 # Get the basketball roster for the 1648 # 2020 Shippensburg WBB team (D2, ID: 484864). 1649 print( 1650 "Get the basketball roster for the " + 1651 "2020 Shippensburg WBB team (D2, ID: 484864)." 1652 ) 1653 df = get_basketball_team_roster(484864) 1654 print(df) 1655 1656 # Get the basketball roster for the 1657 # 2019 Maranatha Baptist team (D3, ID: 452546). 1658 print( 1659 "Get the basketball roster for the " + 1660 "2019 Maranatha Baptist team (D3, ID: 452546)." 1661 ) 1662 df = get_basketball_team_roster(452546) 1663 print(df) 1664 1665 ``` 1666 1667 Returns 1668 ---------- 1669 A pandas `DataFrame` object with 1670 an NCAA basketball team's roster for that season. 1671 """ 1672 sport_id = "" 1673 roster_df = pd.DataFrame() 1674 roster_df_arr = [] 1675 temp_df = pd.DataFrame() 1676 url = f"https://stats.ncaa.org/teams/{team_id}/roster" 1677 load_from_cache = True 1678 home_dir = expanduser("~") 1679 home_dir = _format_folder_str(home_dir) 1680 1681 stat_columns = [ 1682 "season", 1683 "season_name", 1684 "sport_id", 1685 "ncaa_division", 1686 "ncaa_division_formatted", 1687 "team_conference_name", 1688 "school_id", 1689 "school_name", 1690 "player_id", 1691 "player_jersey_num", 1692 "player_full_name", 1693 "player_first_name", 1694 "player_last_name", 1695 "player_class", 1696 "player_positions", 1697 "player_height_string", 1698 "player_weight", 1699 "player_hometown", 1700 "player_high_school", 1701 "player_G", 1702 "player_GS", 1703 "player_url", 1704 ] 1705 1706 try: 1707 team_df = load_basketball_teams() 1708 team_df = team_df[team_df["team_id"] == team_id] 1709 1710 season = team_df["season"].iloc[0] 1711 ncaa_division = team_df["ncaa_division"].iloc[0] 1712 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 1713 team_conference_name = team_df["team_conference_name"].iloc[0] 1714 school_name = team_df["school_name"].iloc[0] 1715 school_id = int(team_df["school_id"].iloc[0]) 1716 sport_id = "MBB" 1717 except Exception: 1718 team_df = load_basketball_teams(get_wbb_data=True) 1719 team_df = team_df[team_df["team_id"] == team_id] 1720 1721 season = team_df["season"].iloc[0] 1722 ncaa_division = team_df["ncaa_division"].iloc[0] 1723 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 1724 team_conference_name = team_df["team_conference_name"].iloc[0] 1725 school_name = team_df["school_name"].iloc[0] 1726 school_id = int(team_df["school_id"].iloc[0]) 1727 school_id = int(team_df["school_id"].iloc[0]) 1728 sport_id = "WBB" 1729 1730 del team_df 1731 1732 if exists(f"{home_dir}/.ncaa_stats_py/"): 1733 pass 1734 else: 1735 mkdir(f"{home_dir}/.ncaa_stats_py/") 1736 1737 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 1738 pass 1739 else: 1740 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 1741 1742 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/"): 1743 pass 1744 else: 1745 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/") 1746 1747 if exists( 1748 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1749 f"{team_id}_roster.csv" 1750 ): 1751 teams_df = pd.read_csv( 1752 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1753 f"{team_id}_roster.csv" 1754 ) 1755 file_mod_datetime = datetime.fromtimestamp( 1756 getmtime( 1757 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1758 f"{team_id}_roster.csv" 1759 ) 1760 ) 1761 else: 1762 file_mod_datetime = datetime.today() 1763 load_from_cache = False 1764 1765 now = datetime.today() 1766 1767 age = now - file_mod_datetime 1768 1769 if ( 1770 age.days >= 14 and 1771 season >= now.year 1772 ): 1773 load_from_cache = False 1774 1775 if load_from_cache is True: 1776 return teams_df 1777 1778 response = _get_webpage(url=url) 1779 soup = BeautifulSoup(response.text, features="lxml") 1780 try: 1781 school_name = soup.find( 1782 "div", 1783 {"class": "card"} 1784 ).find("img").get("alt") 1785 except Exception: 1786 school_name = soup.find("div", {"class": "card"}).find("a").text 1787 school_name = school_name.rsplit(" ", maxsplit=1)[0] 1788 1789 season_name = ( 1790 soup.find("select", {"id": "year_list"}) 1791 .find("option", {"selected": "selected"}) 1792 .text 1793 ) 1794 # For NCAA basketball, the season always starts in the spring semester, 1795 # and ends in the fall semester. 1796 # Thus, if `season_name` = "2011-12", this is the "2012" basketball season, 1797 # because 2012 would encompass the spring and fall semesters 1798 # for NCAA member institutions. 1799 season = f"{season_name[0:2]}{season_name[-2:]}" 1800 season = int(season) 1801 1802 try: 1803 table = soup.find( 1804 "table", 1805 {"class": "dataTable small_font"}, 1806 ) 1807 1808 table_headers = table.find("thead").find_all("th") 1809 except Exception: 1810 table = soup.find( 1811 "table", 1812 {"class": "dataTable small_font no_padding"}, 1813 ) 1814 1815 table_headers = table.find("thead").find_all("th") 1816 table_headers = [x.text for x in table_headers] 1817 1818 t_rows = table.find("tbody").find_all("tr") 1819 1820 for t in t_rows: 1821 t_cells = t.find_all("td") 1822 t_cells = [x.text for x in t_cells] 1823 1824 temp_df = pd.DataFrame( 1825 data=[t_cells], 1826 columns=table_headers, 1827 # index=[0] 1828 ) 1829 1830 player_id = t.find("a").get("href") 1831 # temp_df["school_name"] = school_name 1832 temp_df["player_url"] = f"https://stats.ncaa.org{player_id}" 1833 1834 player_id = player_id.replace("/players", "").replace("/", "") 1835 player_id = int(player_id) 1836 1837 temp_df["player_id"] = player_id 1838 1839 roster_df_arr.append(temp_df) 1840 del temp_df 1841 1842 roster_df = pd.concat(roster_df_arr, ignore_index=True) 1843 roster_df = roster_df.infer_objects() 1844 roster_df["season"] = season 1845 roster_df["season_name"] = season_name 1846 roster_df["ncaa_division"] = ncaa_division 1847 roster_df["ncaa_division_formatted"] = ncaa_division_formatted 1848 roster_df["team_conference_name"] = team_conference_name 1849 roster_df["school_id"] = school_id 1850 roster_df["school_name"] = school_name 1851 roster_df["sport_id"] = sport_id 1852 1853 roster_df.rename( 1854 columns={ 1855 "GP": "player_G", 1856 "GS": "player_GS", 1857 "#": "player_jersey_num", 1858 "Name": "player_full_name", 1859 "Class": "player_class", 1860 "Position": "player_positions", 1861 "Height": "player_height_string", 1862 "Hometown": "player_hometown", 1863 "High School": "player_high_school", 1864 }, 1865 inplace=True 1866 ) 1867 1868 roster_df[["player_first_name", "player_last_name"]] = roster_df[ 1869 "player_full_name" 1870 ].str.split(" ", n=1, expand=True) 1871 roster_df = roster_df.infer_objects() 1872 1873 for i in roster_df.columns: 1874 if i in stat_columns: 1875 pass 1876 else: 1877 raise ValueError( 1878 f"Unhandled column name {i}" 1879 ) 1880 1881 roster_df = roster_df.infer_objects().reindex(columns=stat_columns) 1882 1883 roster_df.to_csv( 1884 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/rosters/" + 1885 f"{team_id}_roster.csv", 1886 index=False, 1887 ) 1888 return roster_df
Retrieves a basketball team's roster from a given team ID.
Parameters
team_id
(int, mandatory):
Required argument.
Specifies the team you want a roster from.
This is separate from a school ID, which identifies the institution.
A team ID should be unique to a school, and a season.
Usage
from ncaa_stats_py.basketball import get_basketball_team_roster
########################################
# Men's Basketball #
########################################
# Get the basketball roster for the
# 2024 Alabama St. MBB team (D1, ID: 560655).
print(
"Get the basketball roster for the " +
"2024 Alabama St. MBB team (D1, ID: 560655)."
)
df = get_basketball_team_roster(560655)
print(df)
# Get the basketball roster for the
# 2023 Roberts Wesleyan MBB team (D2, ID: 542994).
print(
"Get the basketball roster for the " +
"2023 Roberts Wesleyan MBB team (D2, ID: 542994)."
)
df = get_basketball_team_roster(542994)
print(df)
# Get the basketball roster for the
# 2022 Pacific Lutheran MBB team (D3, ID: 528255).
print(
"Get the basketball roster for the " +
"2022 Pacific Lutheran MBB team (D3, ID: 528255)."
)
df = get_basketball_team_roster(528255)
print(df)
########################################
# Women's Basketball #
########################################
# Get the basketball roster for the
# 2021 Michigan St. WBB team (D1, ID: 506069).
print(
"Get the basketball roster for the " +
"2021 Michigan St. WBB team (D1, ID: 506069)."
)
df = get_basketball_team_roster(506069)
print(df)
# Get the basketball roster for the
# 2020 Shippensburg WBB team (D2, ID: 484864).
print(
"Get the basketball roster for the " +
"2020 Shippensburg WBB team (D2, ID: 484864)."
)
df = get_basketball_team_roster(484864)
print(df)
# Get the basketball roster for the
# 2019 Maranatha Baptist team (D3, ID: 452546).
print(
"Get the basketball roster for the " +
"2019 Maranatha Baptist team (D3, ID: 452546)."
)
df = get_basketball_team_roster(452546)
print(df)
Returns
A pandas DataFrame
object with
an NCAA basketball team's roster for that season.
1891def get_basketball_player_season_stats( 1892 team_id: int, 1893) -> pd.DataFrame: 1894 """ 1895 Given a team ID, this function retrieves and parses 1896 the season stats for all of the players in a given basketball team. 1897 1898 Parameters 1899 ---------- 1900 `team_id` (int, mandatory): 1901 Required argument. 1902 Specifies the team you want basketball stats from. 1903 This is separate from a school ID, which identifies the institution. 1904 A team ID should be unique to a school, and a season. 1905 1906 Usage 1907 ---------- 1908 ```python 1909 1910 from ncaa_stats_py.basketball import get_basketball_player_season_stats 1911 1912 ######################################## 1913 # Men's Basketball # 1914 ######################################## 1915 1916 # Get the season stats for the 1917 # 2024 Illinois MBB team (D1, ID: 560955). 1918 print( 1919 "Get the season stats for the " + 1920 "2024 Illinois MBB team (D1, ID: 560955)." 1921 ) 1922 df = get_basketball_player_season_stats(560955) 1923 print(df) 1924 1925 # Get the season stats for the 1926 # 2023 Chico St. MBB team (D2, ID: 542605). 1927 print( 1928 "Get the season stats for the " + 1929 "2023 Chico St. MBB team (D2, ID: 542605)." 1930 ) 1931 df = get_basketball_player_season_stats(542605) 1932 print(df) 1933 1934 # Get the season stats for the 1935 # 2022 Maine Maritime MBB team (D3, ID: 528070). 1936 print( 1937 "Get the season stats for the " + 1938 "2022 Maine Maritime MBB team (D3, ID: 528070)." 1939 ) 1940 df = get_basketball_player_season_stats(528070) 1941 print(df) 1942 1943 ######################################## 1944 # Women's Basketball # 1945 ######################################## 1946 1947 # Get the season stats for the 1948 # 2021 Louisville WBB team (D1, ID: 506050). 1949 print( 1950 "Get the season stats for the " + 1951 "2021 Louisville WBB team (D1, ID: 506050)." 1952 ) 1953 df = get_basketball_player_season_stats(506050) 1954 print(df) 1955 1956 # Get the season stats for the 1957 # 2020 Paine WBB team (D2, ID: 484830). 1958 print( 1959 "Get the season stats for the " + 1960 "2020 Paine WBB team (D2, ID: 484830)." 1961 ) 1962 df = get_basketball_player_season_stats(484830) 1963 print(df) 1964 1965 # Get the season stats for the 1966 # 2019 Pomona-Pitzer team (D3, ID: 452413). 1967 print( 1968 "Get the season stats for the " + 1969 "2019 Pomona-Pitzer team (D3, ID: 452413)." 1970 ) 1971 df = get_basketball_player_season_stats(452413) 1972 print(df) 1973 1974 ``` 1975 1976 Returns 1977 ---------- 1978 A pandas `DataFrame` object with the season batting stats for 1979 all players with a given NCAA basketball team. 1980 """ 1981 1982 sport_id = "" 1983 load_from_cache = True 1984 stats_df = pd.DataFrame() 1985 stats_df_arr = [] 1986 temp_df = pd.DataFrame() 1987 1988 stat_columns = [ 1989 "season", 1990 "season_name", 1991 "sport_id", 1992 "team_id", 1993 "team_conference_name", 1994 "school_id", 1995 "school_name", 1996 "ncaa_division", 1997 "ncaa_division_formatted", 1998 "player_id", 1999 "player_jersey_number", 2000 "player_last_name", 2001 "player_first_name", 2002 "player_full_name", 2003 "player_class", 2004 "player_position", 2005 "player_height", 2006 "GP", 2007 "GS", 2008 "MP_str", 2009 "MP_minutes", 2010 "MP_seconds", 2011 "MP_total_seconds", 2012 "FGM", 2013 "FGA", 2014 "FG%", 2015 "eFG%", 2016 "TSA", 2017 "TS%", 2018 "2PM", 2019 "2PA", 2020 "2FG%", 2021 "3PM", 2022 "3PA", 2023 "3FG%", 2024 "FT", 2025 "FTA", 2026 "FT%", 2027 "PTS", 2028 "ORB", 2029 "DRB", 2030 "TRB", 2031 "Avg", 2032 "AST", 2033 "TOV", 2034 "TOV%", 2035 "STL", 2036 "BLK", 2037 "PF", 2038 "DBL_DBL", 2039 "TRP_DBL", 2040 "DQ", 2041 "TF", 2042 ] 2043 2044 # if get_wbb_data is True: 2045 # sport_id = "WBB" 2046 # else: 2047 # sport_id = "MBB" 2048 2049 try: 2050 team_df = load_basketball_teams() 2051 2052 team_df = team_df[team_df["team_id"] == team_id] 2053 2054 season = team_df["season"].iloc[0] 2055 ncaa_division = int(team_df["ncaa_division"].iloc[0]) 2056 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 2057 team_conference_name = team_df["team_conference_name"].iloc[0] 2058 school_name = team_df["school_name"].iloc[0] 2059 school_id = int(team_df["school_id"].iloc[0]) 2060 sport_id = "MBB" 2061 except Exception: 2062 team_df = load_basketball_teams(get_wbb_data=True) 2063 2064 team_df = team_df[team_df["team_id"] == team_id] 2065 2066 season = team_df["season"].iloc[0] 2067 ncaa_division = int(team_df["ncaa_division"].iloc[0]) 2068 ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 2069 team_conference_name = team_df["team_conference_name"].iloc[0] 2070 school_name = team_df["school_name"].iloc[0] 2071 school_id = int(team_df["school_id"].iloc[0]) 2072 sport_id = "WBB" 2073 2074 del team_df 2075 2076 # stat_id = _get_stat_id( 2077 # sport="basketball", 2078 # season=season, 2079 # stat_type="batting" 2080 # ) 2081 2082 home_dir = expanduser("~") 2083 home_dir = _format_folder_str(home_dir) 2084 2085 url = f"https://stats.ncaa.org/teams/{team_id}/season_to_date_stats" 2086 2087 if exists(f"{home_dir}/.ncaa_stats_py/"): 2088 pass 2089 else: 2090 mkdir(f"{home_dir}/.ncaa_stats_py/") 2091 2092 if exists(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/"): 2093 pass 2094 else: 2095 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/") 2096 2097 if exists( 2098 f"{home_dir}/.ncaa_stats_py/" + 2099 f"basketball_{sport_id}/player_season_stats/" 2100 ): 2101 pass 2102 else: 2103 mkdir( 2104 f"{home_dir}/.ncaa_stats_py/" + 2105 f"basketball_{sport_id}/player_season_stats/" 2106 ) 2107 2108 if exists( 2109 f"{home_dir}/.ncaa_stats_py/" + 2110 f"basketball_{sport_id}/player_season_stats/" 2111 + f"{season:00d}_{school_id:00d}_player_season_stats.csv" 2112 ): 2113 games_df = pd.read_csv( 2114 f"{home_dir}/.ncaa_stats_py/" + 2115 f"basketball_{sport_id}/player_season_stats/" 2116 + f"{season:00d}_{school_id:00d}_player_season_stats.csv" 2117 ) 2118 file_mod_datetime = datetime.fromtimestamp( 2119 getmtime( 2120 f"{home_dir}/.ncaa_stats_py/" + 2121 f"basketball_{sport_id}/player_season_stats/" 2122 + f"{season:00d}_{school_id:00d}_player_season_stats.csv" 2123 ) 2124 ) 2125 else: 2126 file_mod_datetime = datetime.today() 2127 load_from_cache = False 2128 2129 now = datetime.today() 2130 2131 age = now - file_mod_datetime 2132 2133 if ( 2134 age.days > 1 and 2135 season >= now.year 2136 ): 2137 load_from_cache = False 2138 2139 if load_from_cache is True: 2140 return games_df 2141 2142 response = _get_webpage(url=url) 2143 soup = BeautifulSoup(response.text, features="lxml") 2144 # try: 2145 # school_name = soup.find( 2146 # "div", {"class": "card"} 2147 # ).find("img").get("alt") 2148 # except Exception: 2149 # school_name = soup.find("div", {"class": "card"}).find("a").text 2150 # school_name = school_name.rsplit(" ", maxsplit=1)[0] 2151 2152 season_name = ( 2153 soup.find("select", {"id": "year_list"}) 2154 .find("option", {"selected": "selected"}) 2155 .text 2156 ) 2157 # For NCAA basketball, the season always starts in the fall semester, 2158 # and ends in the spring semester. 2159 # Thus, if `season_name` = "2011-12", this is the "2012" basketball season, 2160 # because 2012 would encompass the fall and spring semesters 2161 # for NCAA member institutions. 2162 season = f"{season_name[0:2]}{season_name[-2:]}" 2163 season = int(season) 2164 2165 # stat_categories_arr = soup.find( 2166 # "ul", {"class": "nav nav-tabs padding-nav"} 2167 # ).find_all("a") 2168 2169 table_data = soup.find( 2170 "table", 2171 {"id": "stat_grid", "class": "small_font dataTable table-bordered"}, 2172 ) 2173 2174 temp_table_headers = table_data.find("thead").find("tr").find_all("th") 2175 table_headers = [x.text for x in temp_table_headers] 2176 2177 del temp_table_headers 2178 2179 t_rows = table_data.find("tbody").find_all("tr", {"class": "text"}) 2180 for t in t_rows: 2181 p_last = "" 2182 p_first = "" 2183 t_cells = t.find_all("td") 2184 if "team" in t_cells[1].text.lower(): 2185 continue 2186 p_sortable = t_cells[1].get("data-order") 2187 if len(p_sortable) == 2: 2188 p_last, p_first = p_sortable.split(",") 2189 elif len(p_sortable) == 3: 2190 p_last, temp_name, p_first = p_sortable.split(",") 2191 p_last = f"{p_last} {temp_name}" 2192 2193 t_cells = [x.text.strip() for x in t_cells] 2194 2195 temp_df = pd.DataFrame( 2196 data=[t_cells], 2197 columns=table_headers, 2198 # index=[0] 2199 ) 2200 2201 player_id = t.find("a").get("href") 2202 2203 # temp_df["player_url"] = f"https://stats.ncaa.org{player_id}" 2204 player_id = player_id.replace("/players", "").replace("/", "") 2205 2206 # stat_id = -1 2207 # if "year_stat_category_id" in player_id: 2208 # stat_id = player_id 2209 # stat_id = stat_id.rsplit("?")[-1] 2210 # stat_id = stat_id.replace("?", "").replace( 2211 # "year_stat_category_id=", "" 2212 # ) 2213 # stat_id = int(stat_id) 2214 2215 # player_id = player_id.split("?")[0] 2216 2217 player_id = int(player_id) 2218 2219 temp_df["player_id"] = player_id 2220 temp_df["player_last_name"] = p_last.strip() 2221 temp_df["player_first_name"] = p_first.strip() 2222 2223 stats_df_arr.append(temp_df) 2224 del temp_df 2225 2226 stats_df = pd.concat(stats_df_arr, ignore_index=True) 2227 stats_df = stats_df.replace("", None) 2228 2229 # stats_df["stat_id"] = stat_id 2230 stats_df["season"] = season 2231 stats_df["season_name"] = season_name 2232 stats_df["school_id"] = school_id 2233 stats_df["school_name"] = school_name 2234 stats_df["ncaa_division"] = ncaa_division 2235 stats_df["ncaa_division_formatted"] = ncaa_division_formatted 2236 stats_df["team_conference_name"] = team_conference_name 2237 stats_df["sport_id"] = sport_id 2238 stats_df["team_id"] = team_id 2239 2240 stats_df = stats_df.infer_objects() 2241 2242 stats_df.rename( 2243 columns={ 2244 "#": "player_jersey_number", 2245 "Player": "player_full_name", 2246 "Yr": "player_class", 2247 "Pos": "player_position", 2248 "Ht": "player_height", 2249 "B/T": "player_bats_throws", 2250 "3FG": "3PM", 2251 "3FGA": "3PA", 2252 "ORebs": "ORB", 2253 "DRebs": "DRB", 2254 "Tot Reb": "TRB", 2255 "TO": "TOV", 2256 "Dbl Dbl": "DBL_DBL", 2257 "Trpl Dbl": "TRP_DBL", 2258 "Fouls": "PF", 2259 'Tech Fouls': "TF", 2260 'Effective FG Pct.': "eFG%", 2261 "MP": "MP_str", 2262 "Min": "MP_str", 2263 "Off Reb": "ORB", 2264 "Def Reb": "DRB", 2265 "ST": "STL", 2266 "BLKS": "BLK" 2267 }, 2268 inplace=True, 2269 ) 2270 stats_df = stats_df.infer_objects().fillna(0) 2271 stats_df = stats_df.astype( 2272 { 2273 "GP": "uint16", 2274 "GS": "uint16", 2275 "FGM": "uint16", 2276 "FGA": "uint16", 2277 "3PM": "uint16", 2278 "3PA": "uint16", 2279 "FT": "uint16", 2280 "FTA": "uint16", 2281 "PTS": "uint16", 2282 "ORB": "uint16", 2283 "DRB": "uint16", 2284 "TRB": "uint16", 2285 "AST": "uint16", 2286 "TOV": "uint16", 2287 "STL": "uint16", 2288 "BLK": "uint16", 2289 "PF": "uint16", 2290 "DBL_DBL": "uint16", 2291 "TRP_DBL": "uint16", 2292 "school_id": "uint32", 2293 } 2294 ) 2295 2296 # This is a separate function call because these stats 2297 # *don't* exist in every season. 2298 if "DQ" not in stats_df.columns: 2299 stats_df["DQ"] = None 2300 2301 if "TF" not in stats_df.columns: 2302 stats_df["TF"] = None 2303 2304 stats_df = stats_df.astype( 2305 { 2306 "DQ": "uint16", 2307 "TF": "uint16", 2308 }, 2309 errors="ignore" 2310 ) 2311 2312 stats_df[["MP_minutes", "MP_seconds"]] = stats_df["MP_str"].str.split( 2313 ":", expand=True 2314 ) 2315 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 2316 "MP_minutes", "MP_seconds" 2317 ]].astype("uint64") 2318 stats_df["MP_total_seconds"] = ( 2319 stats_df["MP_seconds"] + (stats_df["MP_minutes"] * 60) 2320 ) 2321 2322 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 2323 stats_df["FG%"] = stats_df["FG%"].round(4) 2324 2325 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 2326 stats_df["3P%"] = stats_df["3P%"].round(4) 2327 2328 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 2329 stats_df["FT%"] = stats_df["FT%"].round(4) 2330 2331 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 2332 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 2333 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 2334 stats_df["2P%"] = stats_df["2P%"].round(4) 2335 2336 stats_df["eFG%"] = ( 2337 ( 2338 stats_df["FGM"] + 2339 (stats_df["3PM"] * 0.5) 2340 ) / 2341 stats_df["FGA"] 2342 ) 2343 stats_df["eFG%"] = stats_df["eFG%"].round(4) 2344 2345 stats_df["TSA"] = ( 2346 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 2347 ) 2348 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 2349 stats_df["TS%"] = stats_df["TS%"].round(4) 2350 2351 stats_df["TOV%"] = ( 2352 stats_df["TOV"] / 2353 ( 2354 stats_df["FGA"] + 2355 (stats_df["FTA"] * 0.44) + 2356 stats_df["TOV"] 2357 ) 2358 ) 2359 stats_df["TOV%"] = stats_df["TOV%"].round(4) 2360 # In many seasons, there is an ["Avg"] column 2361 # that would otherwise completely screw up 2362 # any attempts to use the final DataFrame, 2363 # because it would be a duplicate column 2364 # that pandas wouldn't complain about 2365 # until it's too late. 2366 2367 duplicate_cols = stats_df.columns[stats_df.columns.duplicated()] 2368 stats_df.drop(columns=duplicate_cols, inplace=True) 2369 # stats_df = stats_df.T.drop_duplicates().T 2370 stats_df = stats_df.reindex(columns=stat_columns) 2371 # print(stats_df.columns) 2372 stats_df.to_csv( 2373 f"{home_dir}/.ncaa_stats_py/" + 2374 f"basketball_{sport_id}/player_season_stats/" + 2375 f"{season:00d}_{school_id:00d}_player_season_stats.csv", 2376 index=False, 2377 ) 2378 2379 return stats_df
Given a team ID, this function retrieves and parses the season stats for all of the players in a given basketball team.
Parameters
team_id
(int, mandatory):
Required argument.
Specifies the team you want basketball stats from.
This is separate from a school ID, which identifies the institution.
A team ID should be unique to a school, and a season.
Usage
from ncaa_stats_py.basketball import get_basketball_player_season_stats
########################################
# Men's Basketball #
########################################
# Get the season stats for the
# 2024 Illinois MBB team (D1, ID: 560955).
print(
"Get the season stats for the " +
"2024 Illinois MBB team (D1, ID: 560955)."
)
df = get_basketball_player_season_stats(560955)
print(df)
# Get the season stats for the
# 2023 Chico St. MBB team (D2, ID: 542605).
print(
"Get the season stats for the " +
"2023 Chico St. MBB team (D2, ID: 542605)."
)
df = get_basketball_player_season_stats(542605)
print(df)
# Get the season stats for the
# 2022 Maine Maritime MBB team (D3, ID: 528070).
print(
"Get the season stats for the " +
"2022 Maine Maritime MBB team (D3, ID: 528070)."
)
df = get_basketball_player_season_stats(528070)
print(df)
########################################
# Women's Basketball #
########################################
# Get the season stats for the
# 2021 Louisville WBB team (D1, ID: 506050).
print(
"Get the season stats for the " +
"2021 Louisville WBB team (D1, ID: 506050)."
)
df = get_basketball_player_season_stats(506050)
print(df)
# Get the season stats for the
# 2020 Paine WBB team (D2, ID: 484830).
print(
"Get the season stats for the " +
"2020 Paine WBB team (D2, ID: 484830)."
)
df = get_basketball_player_season_stats(484830)
print(df)
# Get the season stats for the
# 2019 Pomona-Pitzer team (D3, ID: 452413).
print(
"Get the season stats for the " +
"2019 Pomona-Pitzer team (D3, ID: 452413)."
)
df = get_basketball_player_season_stats(452413)
print(df)
Returns
A pandas DataFrame
object with the season batting stats for
all players with a given NCAA basketball team.
2382def get_basketball_player_game_stats( 2383 player_id: int, 2384 season: int 2385) -> pd.DataFrame: 2386 """ 2387 Given a valid player ID and season, 2388 this function retrieves the game stats for this player at a game level. 2389 2390 Parameters 2391 ---------- 2392 `player_id` (int, mandatory): 2393 Required argument. 2394 Specifies the player you want game stats from. 2395 2396 `season` (int, mandatory): 2397 Required argument. 2398 Specifies the season you want game stats from. 2399 2400 Usage 2401 ---------- 2402 ```python 2403 2404 from ncaa_stats_py.basketball import ( 2405 get_basketball_player_game_stats 2406 ) 2407 2408 # Get the batting stats of Jacob Berry in 2022 (LSU). 2409 print( 2410 "Get the batting stats of Jacob Berry in 2022 (LSU)." 2411 ) 2412 df = get_basketball_player_game_stats(player_id=7579336, season=2022) 2413 print(df) 2414 2415 # Get the batting stats of Alec Burleson in 2019 (ECU). 2416 print( 2417 "Get the batting stats of Alec Burleson in 2019 (ECU)." 2418 ) 2419 df = get_basketball_player_game_stats(player_id=6015715, season=2019) 2420 print(df) 2421 2422 # Get the batting stats of Hunter Bishop in 2018 (Arizona St.). 2423 print( 2424 "Get the batting stats of Hunter Bishop in 2018 (Arizona St.)." 2425 ) 2426 df = get_basketball_player_game_stats(player_id=6014052, season=2019) 2427 print(df) 2428 2429 ``` 2430 2431 Returns 2432 ---------- 2433 A pandas `DataFrame` object with a player's batting game logs 2434 in a given season. 2435 """ 2436 sport_id = "" 2437 2438 stat_columns = [ 2439 "season", 2440 "game_id", 2441 "game_num", 2442 "player_id", 2443 "date", 2444 "opponent", 2445 "Result", 2446 "team_score", 2447 "opponent_score", 2448 "MP_str", 2449 "MP_minutes", 2450 "MP_seconds", 2451 "MP_total_seconds", 2452 "GP", 2453 "GS", 2454 "FGM", 2455 "FGA", 2456 "FG%", 2457 "eFG%", 2458 "2PM", 2459 "2PA", 2460 "2P%", 2461 "3PM", 2462 "3PA", 2463 "3P%", 2464 "FT", 2465 "FTA", 2466 "FT%", 2467 "ORB", 2468 "DRB", 2469 "TRB", 2470 "AST", 2471 "TOV", 2472 "TOV%", 2473 "STL", 2474 "BLK", 2475 "PF", 2476 "DQ", 2477 "TF", 2478 "TSA", 2479 "TS%", 2480 "PTS", 2481 "DBL_DBL", 2482 "TRP_DBL", 2483 ] 2484 load_from_cache = True 2485 stats_df = pd.DataFrame() 2486 stats_df_arr = [] 2487 temp_df = pd.DataFrame() 2488 home_dir = expanduser("~") 2489 home_dir = _format_folder_str(home_dir) 2490 2491 # stat_id = _get_stat_id( 2492 # sport="basketball", 2493 # season=season, 2494 # stat_type="batting" 2495 # ) 2496 url = f"https://stats.ncaa.org/players/{player_id}" 2497 2498 if exists(f"{home_dir}/.ncaa_stats_py/"): 2499 pass 2500 else: 2501 mkdir(f"{home_dir}/.ncaa_stats_py/") 2502 2503 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/"): 2504 pass 2505 else: 2506 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/") 2507 2508 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/"): 2509 pass 2510 else: 2511 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/") 2512 2513 if exists( 2514 f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/" 2515 + f"{season}_{player_id}_player_game_stats.csv" 2516 ): 2517 games_df = pd.read_csv( 2518 f"{home_dir}/.ncaa_stats_py/basketball_MBB/player_game_stats/" 2519 + f"{season}_{player_id}_player_game_stats.csv" 2520 ) 2521 file_mod_datetime = datetime.fromtimestamp( 2522 getmtime( 2523 f"{home_dir}/.ncaa_stats_py/basketball_MBB/" 2524 + "player_game_stats/" 2525 + f"{season}_{player_id}_player_game_stats.csv" 2526 ) 2527 ) 2528 games_df = games_df.infer_objects() 2529 load_from_cache = True 2530 else: 2531 file_mod_datetime = datetime.today() 2532 load_from_cache = False 2533 2534 if exists(f"{home_dir}/.ncaa_stats_py/"): 2535 pass 2536 else: 2537 mkdir(f"{home_dir}/.ncaa_stats_py/") 2538 2539 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/"): 2540 pass 2541 else: 2542 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/") 2543 2544 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/"): 2545 pass 2546 else: 2547 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/") 2548 2549 if exists( 2550 f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/" 2551 + f"{season}_{player_id}_player_game_stats.csv" 2552 ): 2553 games_df = pd.read_csv( 2554 f"{home_dir}/.ncaa_stats_py/basketball_WBB/player_game_stats/" 2555 + f"{season}_{player_id}_player_game_stats.csv" 2556 ) 2557 file_mod_datetime = datetime.fromtimestamp( 2558 getmtime( 2559 f"{home_dir}/.ncaa_stats_py/basketball_WBB/" 2560 + "player_game_stats/" 2561 + f"{season}_{player_id}_player_game_stats.csv" 2562 ) 2563 ) 2564 games_df = games_df.infer_objects() 2565 load_from_cache = True 2566 else: 2567 logging.info("Could not find a WBB player game stats file") 2568 2569 now = datetime.today() 2570 2571 age = now - file_mod_datetime 2572 2573 if ( 2574 age.days > 1 and 2575 (season - 1) >= now.year 2576 ): 2577 load_from_cache = False 2578 2579 if load_from_cache is True: 2580 return games_df 2581 2582 # team_df = load_basketball_teams() 2583 2584 # team_df = team_df[team_df["team_id"] == team_id] 2585 2586 # season = team_df["season"].iloc[0] 2587 # ncaa_division = team_df["ncaa_division"].iloc[0] 2588 # ncaa_division_formatted = team_df["ncaa_division_formatted"].iloc[0] 2589 # team_conference_name = team_df["team_conference_name"].iloc[0] 2590 # school_name = team_df["school_name"].iloc[0] 2591 # school_id = int(team_df["school_id"].iloc[0]) 2592 2593 # del team_df 2594 response = _get_webpage(url=url) 2595 soup = BeautifulSoup(response.text, features="lxml") 2596 2597 table_navigation = soup.find("ul", {"class": "nav nav-tabs padding-nav"}) 2598 table_nav_card = table_navigation.find_all("a") 2599 2600 for u in table_nav_card: 2601 url_str = u.get("href") 2602 if "MBB" in url_str.upper(): 2603 sport_id = "MBB" 2604 elif "WBB" in url_str.upper(): 2605 sport_id = "WBB" 2606 2607 if sport_id is None or len(sport_id) == 0: 2608 # This should **never** be the case IRL, 2609 # but in case something weird happened and 2610 # we can't make a determination of if this is a 2611 # MBB player or a WBB player, and we somehow haven't 2612 # crashed by this point, set the sport ID to 2613 # "MBB" by default so we don't have other weirdness. 2614 logging.error( 2615 f"Could not determine if player ID {player_id} " + 2616 "is a MBB or a WBB player. " + 2617 "Because this cannot be determined, " + 2618 "we will make the automatic assumption that this is a MBB player." 2619 ) 2620 sport_id = "MBB" 2621 2622 table_data = soup.find_all( 2623 "table", {"class": "small_font dataTable table-bordered"} 2624 )[1] 2625 2626 temp_table_headers = table_data.find("thead").find("tr").find_all("th") 2627 table_headers = [x.text for x in temp_table_headers] 2628 2629 del temp_table_headers 2630 2631 temp_t_rows = table_data.find("tbody") 2632 temp_t_rows = temp_t_rows.find_all("tr") 2633 2634 for t in temp_t_rows: 2635 game_num = 1 2636 ot_periods = 0 2637 # innings = 9 2638 row_id = t.get("id") 2639 opp_team_name = "" 2640 2641 if "contest" not in row_id: 2642 continue 2643 del row_id 2644 2645 t_cells = t.find_all("td") 2646 t_cells = [x.text.strip() for x in t_cells] 2647 2648 g_date = t_cells[0] 2649 2650 if "(" in g_date: 2651 g_date, game_num = g_date.split("(") 2652 g_date = g_date.strip() 2653 2654 game_num = game_num.replace(")", "") 2655 game_num = int(game_num) 2656 2657 try: 2658 opp_team_id = t.find_all("td")[1].find("a").get("href") 2659 except AttributeError as e: 2660 logging.info( 2661 "Could not extract a team ID for this game. " + 2662 f"Full exception {e}" 2663 ) 2664 except Exception as e: 2665 logging.warning( 2666 "An unhandled exception has occurred when " 2667 + "trying to get the opposition team ID for this game. " 2668 f"Full exception `{e}`." 2669 ) 2670 raise e 2671 2672 try: 2673 opp_team_id = opp_team_id.replace("/teams/", "") 2674 opp_team_id = opp_team_id.replace( 2675 "javascript:toggleDefensiveStats(", "" 2676 ) 2677 opp_team_id = opp_team_id.replace(");", "") 2678 opp_team_id = int(opp_team_id) 2679 2680 temp_df["opponent_team_id"] = opp_team_id 2681 except Exception: 2682 logging.info( 2683 "Couldn't find the opposition team naIDme " 2684 + "for this row. " 2685 ) 2686 opp_team_id = None 2687 # print(i.find("td").text) 2688 try: 2689 opp_team_name = t.find_all("td")[1].find_all("img")[1].get("alt") 2690 except AttributeError: 2691 logging.info( 2692 "Couldn't find the opposition team name " 2693 + "for this row from an image element. " 2694 + "Attempting a backup method" 2695 ) 2696 opp_team_name = t_cells[1] 2697 except IndexError: 2698 logging.info( 2699 "Couldn't find the opposition team name " 2700 + "for this row from an image element. " 2701 + "Attempting a backup method" 2702 ) 2703 opp_team_name = t_cells[1] 2704 except Exception as e: 2705 logging.warning( 2706 "Unhandled exception when trying to get the " 2707 + "opposition team name from this game. " 2708 + f"Full exception `{e}`" 2709 ) 2710 raise e 2711 2712 if opp_team_name == "Defensive Stats": 2713 opp_team_name = t_cells[1] 2714 2715 if "@" in opp_team_name: 2716 opp_team_name = opp_team_name.split("@")[0] 2717 2718 result_str = t_cells[2] 2719 2720 result_str = ( 2721 result_str.lower().replace("w", "").replace("l", "").replace( 2722 "t", "" 2723 ) 2724 ) 2725 2726 if ( 2727 result_str.lower() == "ppd" or 2728 result_str.lower() == "" or 2729 result_str.lower() == "canceed" 2730 ): 2731 continue 2732 2733 result_str = result_str.replace("\n", "") 2734 result_str = result_str.replace("*", "") 2735 2736 tm_score, opp_score = result_str.split("-") 2737 t_cells = [x.replace("*", "") for x in t_cells] 2738 t_cells = [x.replace("/", "") for x in t_cells] 2739 t_cells = [x.replace("\\", "") for x in t_cells] 2740 2741 temp_df = pd.DataFrame( 2742 data=[t_cells], 2743 columns=table_headers, 2744 # index=[0] 2745 ) 2746 2747 tm_score = int(tm_score) 2748 if "(" in opp_score: 2749 opp_score = opp_score.replace(")", "") 2750 opp_score, ot_periods = opp_score.split("(") 2751 temp_df["ot_periods"] = ot_periods 2752 2753 if "\n" in opp_score: 2754 opp_score = opp_score.strip() 2755 # opp_score = opp_score 2756 opp_score = int(opp_score) 2757 2758 temp_df["team_score"] = tm_score 2759 temp_df["opponent_score"] = opp_score 2760 2761 del tm_score 2762 del opp_score 2763 2764 g_id = t.find_all("td")[2].find("a").get("href") 2765 2766 g_id = g_id.replace("/contests", "") 2767 g_id = g_id.replace("/box_score", "") 2768 g_id = g_id.replace("/", "") 2769 2770 g_id = int(g_id) 2771 temp_df["game_id"] = g_id 2772 2773 del g_id 2774 temp_df.rename( 2775 columns={"Opponent": "opponent", "Date": "date"}, 2776 inplace=True, 2777 ) 2778 game_date = datetime.strptime(g_date, "%m/%d/%Y").date() 2779 2780 temp_df["date"] = game_date 2781 temp_df["game_num"] = game_num 2782 # temp_df["game_innings"] = innings 2783 2784 if len(opp_team_name) > 0: 2785 temp_df["opponent"] = opp_team_name 2786 del opp_team_name 2787 2788 duplicate_cols = temp_df.columns[temp_df.columns.duplicated()] 2789 temp_df.drop(columns=duplicate_cols, inplace=True) 2790 2791 stats_df_arr.append(temp_df) 2792 del temp_df 2793 2794 stats_df = pd.concat(stats_df_arr, ignore_index=True) 2795 stats_df = stats_df.replace("/", "", regex=True) 2796 stats_df = stats_df.replace("", np.nan) 2797 stats_df = stats_df.infer_objects() 2798 2799 stats_df["player_id"] = player_id 2800 stats_df["season"] = season 2801 # In many seasons, there is an ["Avg"] column 2802 # that would otherwise completely screw up 2803 # any attempts to use the final DataFrame, 2804 # because it would be a duplicate column 2805 # that pandas wouldn't complain about 2806 # until it's too late. 2807 2808 duplicate_cols = stats_df.columns[stats_df.columns.duplicated()] 2809 stats_df.drop(columns=duplicate_cols, inplace=True) 2810 2811 stats_df.rename( 2812 columns={ 2813 "#": "player_jersey_number", 2814 "Player": "player_full_name", 2815 "Yr": "player_class", 2816 "Pos": "player_position", 2817 "Ht": "player_height", 2818 "B/T": "player_bats_throws", 2819 "3FG": "3PM", 2820 "3FGA": "3PA", 2821 "ORebs": "ORB", 2822 "DRebs": "DRB", 2823 "Tot Reb": "TRB", 2824 "TO": "TOV", 2825 "Dbl Dbl": "DBL_DBL", 2826 "Trpl Dbl": "TRP_DBL", 2827 "Fouls": "PF", 2828 'Tech Fouls': "TF", 2829 'Effective FG Pct.': "eFG%", 2830 "MP": "MP_str", 2831 "Min": "MP_str", 2832 "Off Reb": "ORB", 2833 "Def Reb": "DRB", 2834 "ST": "STL", 2835 "3FG%": "3P%", 2836 "BLKS": "BLK" 2837 }, 2838 inplace=True, 2839 ) 2840 2841 # This is a separate function call because these stats 2842 # *don't* exist in every season. 2843 if "GS" not in stats_df.columns: 2844 stats_df["GS"] = None 2845 2846 if "DQ" not in stats_df.columns: 2847 stats_df["DQ"] = None 2848 2849 if "TF" not in stats_df.columns: 2850 stats_df["TF"] = None 2851 2852 if "DBL_DBL" not in stats_df.columns: 2853 stats_df["DBL_DBL"] = None 2854 2855 if "TRP_DBL" not in stats_df.columns: 2856 stats_df["TRP_DBL"] = None 2857 2858 stats_df = stats_df.astype( 2859 { 2860 "DQ": "uint16", 2861 "TF": "uint16", 2862 }, 2863 errors="ignore" 2864 ) 2865 2866 stats_df = stats_df.infer_objects().fillna(0) 2867 stats_df = stats_df.astype( 2868 { 2869 "GP": "uint16", 2870 "GS": "uint16", 2871 "FGM": "uint16", 2872 "FGA": "uint16", 2873 "3PM": "uint16", 2874 "3PA": "uint16", 2875 "FT": "uint16", 2876 "FTA": "uint16", 2877 "PTS": "uint16", 2878 "ORB": "uint16", 2879 "DRB": "uint16", 2880 "TRB": "uint16", 2881 "AST": "uint16", 2882 "TOV": "uint16", 2883 "STL": "uint16", 2884 "BLK": "uint16", 2885 "PF": "uint16", 2886 "DBL_DBL": "uint16", 2887 "TRP_DBL": "uint16", 2888 # "school_id": "uint32", 2889 } 2890 ) 2891 2892 stats_df[["MP_minutes", "MP_seconds"]] = stats_df["MP_str"].str.split( 2893 ":", expand=True 2894 ) 2895 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 2896 "MP_minutes", "MP_seconds" 2897 ]].fillna(0) 2898 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 2899 "MP_minutes", "MP_seconds" 2900 ]].astype("uint16") 2901 stats_df["MP_total_seconds"] = ( 2902 stats_df["MP_seconds"] + (stats_df["MP_minutes"] * 60) 2903 ) 2904 2905 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 2906 stats_df["FG%"] = stats_df["FG%"].round(4) 2907 2908 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 2909 stats_df["3P%"] = stats_df["3P%"].round(4) 2910 2911 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 2912 stats_df["FT%"] = stats_df["FT%"].round(4) 2913 2914 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 2915 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 2916 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 2917 stats_df["2P%"] = stats_df["2P%"].round(4) 2918 2919 stats_df["eFG%"] = ( 2920 ( 2921 stats_df["FGM"] + 2922 (stats_df["3PM"] * 0.5) 2923 ) / 2924 stats_df["FGA"] 2925 ) 2926 stats_df["eFG%"] = stats_df["eFG%"].round(4) 2927 2928 stats_df["TSA"] = ( 2929 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 2930 ) 2931 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 2932 stats_df["TS%"] = stats_df["TS%"].round(4) 2933 2934 stats_df["TOV%"] = ( 2935 stats_df["TOV"] / 2936 ( 2937 stats_df["FGA"] + 2938 (stats_df["FTA"] * 0.44) + 2939 stats_df["TOV"] 2940 ) 2941 ) 2942 stats_df["TOV%"] = stats_df["TOV%"].round(4) 2943 stats_df = stats_df.reindex( 2944 columns=stat_columns 2945 ) 2946 # print(stats_df.columns) 2947 stats_df.to_csv( 2948 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/" 2949 + "player_game_stats/" 2950 + f"{season}_{player_id}_player_game_stats.csv", 2951 index=False, 2952 ) 2953 return stats_df
Given a valid player ID and season, this function retrieves the game stats for this player at a game level.
Parameters
player_id
(int, mandatory):
Required argument.
Specifies the player you want game stats from.
season
(int, mandatory):
Required argument.
Specifies the season you want game stats from.
Usage
from ncaa_stats_py.basketball import (
get_basketball_player_game_stats
)
# Get the batting stats of Jacob Berry in 2022 (LSU).
print(
"Get the batting stats of Jacob Berry in 2022 (LSU)."
)
df = get_basketball_player_game_stats(player_id=7579336, season=2022)
print(df)
# Get the batting stats of Alec Burleson in 2019 (ECU).
print(
"Get the batting stats of Alec Burleson in 2019 (ECU)."
)
df = get_basketball_player_game_stats(player_id=6015715, season=2019)
print(df)
# Get the batting stats of Hunter Bishop in 2018 (Arizona St.).
print(
"Get the batting stats of Hunter Bishop in 2018 (Arizona St.)."
)
df = get_basketball_player_game_stats(player_id=6014052, season=2019)
print(df)
Returns
A pandas DataFrame
object with a player's batting game logs
in a given season.
2956def get_basketball_game_player_stats(game_id: int) -> pd.DataFrame: 2957 """ 2958 Given a valid game ID, 2959 this function will attempt to get all player game stats, if possible. 2960 2961 NOTE: Due to an issue with [stats.ncaa.org](stats.ncaa.org), 2962 full player game stats may not be loaded in through this function. 2963 2964 This is a known issue, however you should be able to get position 2965 data and starters information through this function 2966 2967 Parameters 2968 ---------- 2969 `game_id` (int, mandatory): 2970 Required argument. 2971 Specifies the game you want player game stats from. 2972 2973 Usage 2974 ---------- 2975 ```python 2976 2977 from ncaa_stats_py.basketball import get_basketball_game_player_stats 2978 2979 ######################################## 2980 # Men's Basketball # 2981 ######################################## 2982 2983 # Get the game stats of the 2984 # 2024 NCAA D1 Men's Basketball National Championship game. 2985 print( 2986 "Get the game stats of the " 2987 + "2024 NCAA D1 Men's Basketball National Championship game." 2988 ) 2989 df = get_basketball_game_player_stats(5254137) 2990 print(df) 2991 2992 # Get the game stats of a March Madness game on March 29th, 2024 2993 # between Duke and the Houston Cougars. 2994 print( 2995 "Get the game stats of a March Madness game on March 29th, 2024 " 2996 + "between Duke and the Houston Cougars." 2997 ) 2998 df = get_basketball_game_player_stats(5254126) 2999 print(df) 3000 3001 # Get the game stats of a St. Patrick's Day 3002 # game between the Duquesne Dukes and VCU Rams (D1). 3003 print( 3004 "Get the game stats of a St. Patrick's Day " 3005 + "game between the Duquesne Dukes and VCU Rams (D1)." 3006 ) 3007 df = get_basketball_game_player_stats(5252318) 3008 print(df) 3009 3010 # Get the game stats of a December 17th, 2023 3011 # game between the Barry Buccaneers and Findlay Oilers (D2). 3012 print( 3013 "Get the game stats of a December 17th, 2023 " 3014 + "game between the Barry Buccaneers and Findlay Oilers (D2)." 3015 ) 3016 df = get_basketball_game_player_stats(3960610) 3017 print(df) 3018 3019 # Get the game stats of a Valentine's Day 3020 # game between the Kalamazoo Hornets and the Trine Thunder (D2). 3021 print( 3022 "Get the game stats of a Valentine's Day " 3023 + "game between the Kalamazoo Hornets and the Trine Thunder (D2)." 3024 ) 3025 df = get_basketball_game_player_stats(3967963) 3026 print(df) 3027 3028 3029 ######################################## 3030 # Women's Basketball # 3031 ######################################## 3032 3033 # Get the game stats of the 3034 # 2024 NCAA D1 Women's Basketball National Championship game. 3035 print( 3036 "Get the game stats of the " 3037 + "2024 NCAA D1 Women's Basketball National Championship game" 3038 ) 3039 df = get_basketball_game_player_stats(5254137) 3040 print(df) 3041 3042 # Get the game stats of a March 3rd, 2024 3043 # game between Duke and the North Carolina Tar Heels. 3044 print( 3045 "Get the game stats of a March 3rd, 2024 " 3046 + "game between Duke and the North Carolina Tar Heels" 3047 ) 3048 df = get_basketball_game_player_stats(3984600) 3049 print(df) 3050 3051 # Get the game stats of a Thanksgiving Day 3052 # game between the Sacred Heart Pioneers and the P.R.-Mayaguez Janes (D2). 3053 print( 3054 "Get the game stats of a Thanksgiving Day " 3055 + "game between the Sacred Heart Pioneers and " 3056 + "the P.R.-Mayaguez Janes (D2)." 3057 ) 3058 df = get_basketball_game_player_stats(3972687) 3059 print(df) 3060 3061 # Get the game stats of a January 21st, 2024 3062 # game between the Puget Sound Loggers 3063 # and the Whitworth Pirates (D3). 3064 print( 3065 "Get the game stats of a January 21st, 2024 " 3066 + "game between the Puget Sound Loggers and " 3067 + "the Whitworth Pirates (D3)." 3068 ) 3069 df = get_basketball_game_player_stats(3979051) 3070 print(df) 3071 ``` 3072 3073 Returns 3074 ---------- 3075 A pandas `DataFrame` object with player game stats in a given game. 3076 3077 """ 3078 load_from_cache = True 3079 3080 sport_id = "" 3081 season = 0 3082 3083 mbb_teams_df = load_basketball_teams(get_wbb_data=False) 3084 mbb_team_ids_arr = mbb_teams_df["team_id"].to_list() 3085 3086 wbb_teams_df = load_basketball_teams(get_wbb_data=True) 3087 wbb_team_ids_arr = wbb_teams_df["team_id"].to_list() 3088 3089 stats_df = pd.DataFrame() 3090 stats_df_arr = [] 3091 3092 temp_df = pd.DataFrame() 3093 home_dir = expanduser("~") 3094 home_dir = _format_folder_str(home_dir) 3095 3096 stat_columns = [ 3097 "season", 3098 "game_id", 3099 "team_id", 3100 "team_name", 3101 "player_id", 3102 "player_num", 3103 "player_full_name", 3104 "player_position", 3105 "GP", 3106 "GS", 3107 "MP_str", 3108 "MP_minutes", 3109 "MP_seconds", 3110 "MP_total_seconds", 3111 "FGM", 3112 "FGA", 3113 "FG%", 3114 "3PM", 3115 "3PA", 3116 "3P%", 3117 "2PM", 3118 "2PA", 3119 "2P%", 3120 "eFG%", 3121 "FT", 3122 "FTA", 3123 "FT%", 3124 "TSA", 3125 "TS%", 3126 "ORB", 3127 "DRB", 3128 "TRB", 3129 "AST", 3130 "STL", 3131 "BLK", 3132 "TOV", 3133 "TOV%", 3134 "PF", 3135 "TF", 3136 "PTS", 3137 "DQ", 3138 "DBL_DBL", 3139 "TRP_DBL", 3140 ] 3141 3142 url = f"https://stats.ncaa.org/contests/{game_id}/individual_stats" 3143 3144 if exists(f"{home_dir}/.ncaa_stats_py/"): 3145 pass 3146 else: 3147 mkdir(f"{home_dir}/.ncaa_stats_py/") 3148 3149 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/"): 3150 pass 3151 else: 3152 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/") 3153 3154 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/"): 3155 pass 3156 else: 3157 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/") 3158 3159 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/"): 3160 pass 3161 else: 3162 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/") 3163 3164 if exists( 3165 f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/" 3166 + f"{game_id}_player_game_stats.csv" 3167 ): 3168 games_df = pd.read_csv( 3169 f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/" 3170 + f"{game_id}_player_game_stats.csv" 3171 ) 3172 games_df = games_df.infer_objects() 3173 file_mod_datetime = datetime.fromtimestamp( 3174 getmtime( 3175 f"{home_dir}/.ncaa_stats_py/basketball_MBB/game_stats/player/" 3176 + f"{game_id}_player_game_stats.csv" 3177 ) 3178 ) 3179 load_from_cache = True 3180 else: 3181 file_mod_datetime = datetime.today() 3182 load_from_cache = False 3183 3184 if exists(f"{home_dir}/.ncaa_stats_py/"): 3185 pass 3186 else: 3187 mkdir(f"{home_dir}/.ncaa_stats_py/") 3188 3189 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/"): 3190 pass 3191 else: 3192 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/") 3193 3194 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/"): 3195 pass 3196 else: 3197 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/") 3198 3199 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/"): 3200 pass 3201 else: 3202 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/") 3203 3204 if exists( 3205 f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/" 3206 + f"{game_id}_player_game_stats.csv" 3207 ): 3208 games_df = pd.read_csv( 3209 f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/" 3210 + f"{game_id}_player_game_stats.csv" 3211 ) 3212 games_df = games_df.infer_objects() 3213 file_mod_datetime = datetime.fromtimestamp( 3214 getmtime( 3215 f"{home_dir}/.ncaa_stats_py/basketball_WBB/game_stats/player/" 3216 + f"{game_id}_player_game_stats.csv" 3217 ) 3218 ) 3219 load_from_cache = True 3220 else: 3221 logging.info("Could not find a WBB player game stats file") 3222 3223 now = datetime.today() 3224 3225 age = now - file_mod_datetime 3226 3227 if age.days >= 35: 3228 load_from_cache = False 3229 3230 if load_from_cache is True: 3231 return games_df 3232 3233 response = _get_webpage(url=url) 3234 soup = BeautifulSoup(response.text, features="lxml") 3235 3236 # table_data = soup.find_all( 3237 # "table", 3238 # {"class": "small_font dataTable table-bordered"} 3239 # )[1] 3240 table_boxes = soup.find_all("div", {"class": "card p-0 table-responsive"}) 3241 3242 for box in table_boxes: 3243 t_header = box.find( 3244 "div", {"class": "card-header"} 3245 ).find( 3246 "div", {"class": "row"} 3247 ) 3248 3249 t_header_str = t_header.text 3250 t_header_str = t_header_str.replace("Period Stats", "") 3251 t_header_str = t_header_str.replace("\n", "") 3252 t_header_str = t_header_str.strip() 3253 3254 team_id = t_header.find("a").get("href") 3255 team_id = team_id.replace("/teams", "") 3256 team_id = team_id.replace("/", "") 3257 team_id = int(team_id) 3258 3259 table_data = box.find( 3260 "table", 3261 {"class": "display dataTable small_font"} 3262 ) 3263 table_headers = box.find("thead").find_all("th") 3264 table_headers = [x.text for x in table_headers] 3265 3266 temp_t_rows = table_data.find("tbody") 3267 temp_t_rows = temp_t_rows.find_all("tr") 3268 3269 spec_stats_df = pd.DataFrame() 3270 spec_stats_df_arr = [] 3271 for t in temp_t_rows: 3272 # row_id = t.get("id") 3273 game_played = 1 3274 game_started = 0 3275 3276 try: 3277 player_id = t.find("a").get("href") 3278 player_id = player_id.replace("/players", "") 3279 player_id = player_id.replace("/player", "") 3280 player_id = player_id.replace("/", "") 3281 except Exception as e: 3282 logging.debug( 3283 "Could not replace player IDs. " + 3284 f"Full exception: `{e}`" 3285 ) 3286 3287 t_cells = t.find_all("td") 3288 p_name = t_cells[1].text.replace("\n", "") 3289 p_name = p_name.strip() 3290 3291 if t_header_str in p_name: 3292 continue 3293 elif p_name.lower() == "team": 3294 continue 3295 if "\xa0" in p_name: 3296 game_started = 0 3297 3298 t_cells = [x.text.strip() for x in t_cells] 3299 player_id = int(player_id) 3300 3301 temp_df = pd.DataFrame( 3302 data=[t_cells], 3303 columns=table_headers 3304 ) 3305 3306 duplicate_cols = temp_df.columns[temp_df.columns.duplicated()] 3307 temp_df.drop(columns=duplicate_cols, inplace=True) 3308 3309 temp_df["player_id"] = player_id 3310 temp_df["GP"] = game_played 3311 temp_df["GS"] = game_started 3312 3313 spec_stats_df_arr.append(temp_df) 3314 del temp_df 3315 3316 spec_stats_df = pd.concat( 3317 spec_stats_df_arr, 3318 ignore_index=True 3319 ) 3320 3321 if team_id in mbb_team_ids_arr: 3322 sport_id = "MBB" 3323 df = mbb_teams_df[mbb_teams_df["team_id"] == team_id] 3324 season = df["season"].iloc[0] 3325 elif team_id in wbb_team_ids_arr: 3326 sport_id = "WBB" 3327 df = wbb_teams_df[wbb_teams_df["team_id"] == team_id] 3328 season = df["season"].iloc[0] 3329 else: 3330 raise ValueError( 3331 f"Unhandled team ID {team_id}" 3332 ) 3333 spec_stats_df["team_id"] = team_id 3334 spec_stats_df["team_name"] = t_header_str 3335 stats_df_arr.append(spec_stats_df) 3336 del spec_stats_df 3337 3338 stats_df = pd.concat(stats_df_arr) 3339 stats_df["season"] = season 3340 stats_df.rename( 3341 columns={ 3342 "#": "player_num", 3343 "Name": "player_full_name", 3344 "P": "player_position", 3345 "MP": "MP_str", 3346 "3FG": "3PM", 3347 "3FGA": "3PA", 3348 "ORebs": "ORB", 3349 "DRebs": "DRB", 3350 "TotReb": "TRB", 3351 "TO": "TOV", 3352 "TechFouls": "TF", 3353 "Fouls": "PF" 3354 }, 3355 inplace=True, 3356 ) 3357 3358 if "GS" not in stats_df.columns: 3359 stats_df["GS"] = None 3360 3361 if "DQ" not in stats_df.columns: 3362 stats_df["DQ"] = None 3363 3364 if "TF" not in stats_df.columns: 3365 stats_df["TF"] = None 3366 3367 if "DBL_DBL" not in stats_df.columns: 3368 stats_df["DBL_DBL"] = None 3369 3370 if "TRP_DBL" not in stats_df.columns: 3371 stats_df["TRP_DBL"] = None 3372 3373 stats_df = stats_df.astype( 3374 { 3375 "DQ": "uint16", 3376 "TF": "uint16", 3377 }, 3378 errors="ignore" 3379 ) 3380 3381 stats_df = stats_df.infer_objects().fillna(0) 3382 stats_df = stats_df.astype( 3383 { 3384 "GP": "uint16", 3385 "GS": "uint16", 3386 "FGM": "uint16", 3387 "FGA": "uint16", 3388 "3PM": "uint16", 3389 "3PA": "uint16", 3390 "FT": "uint16", 3391 "FTA": "uint16", 3392 "PTS": "uint16", 3393 "ORB": "uint16", 3394 "DRB": "uint16", 3395 "TRB": "uint16", 3396 "AST": "uint16", 3397 "TOV": "uint16", 3398 "STL": "uint16", 3399 "BLK": "uint16", 3400 "PF": "uint16", 3401 "DBL_DBL": "uint16", 3402 "TRP_DBL": "uint16", 3403 # "school_id": "uint32", 3404 } 3405 ) 3406 3407 stats_df[["MP_minutes", "MP_seconds"]] = stats_df["MP_str"].str.split( 3408 ":", expand=True 3409 ) 3410 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 3411 "MP_minutes", "MP_seconds" 3412 ]].fillna(0) 3413 stats_df[["MP_minutes", "MP_seconds"]] = stats_df[[ 3414 "MP_minutes", "MP_seconds" 3415 ]].astype("uint16") 3416 stats_df["MP_total_seconds"] = ( 3417 stats_df["MP_seconds"] + (stats_df["MP_minutes"] * 60) 3418 ) 3419 3420 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 3421 stats_df["FG%"] = stats_df["FG%"].round(4) 3422 3423 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 3424 stats_df["3P%"] = stats_df["3P%"].round(4) 3425 3426 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 3427 stats_df["FT%"] = stats_df["FT%"].round(4) 3428 3429 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 3430 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 3431 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 3432 stats_df["2P%"] = stats_df["2P%"].round(4) 3433 3434 stats_df["eFG%"] = ( 3435 ( 3436 stats_df["FGM"] + 3437 (stats_df["3PM"] * 0.5) 3438 ) / 3439 stats_df["FGA"] 3440 ) 3441 stats_df["eFG%"] = stats_df["eFG%"].round(4) 3442 3443 stats_df["TSA"] = ( 3444 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 3445 ) 3446 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 3447 stats_df["TS%"] = stats_df["TS%"].round(4) 3448 3449 stats_df["TOV%"] = ( 3450 stats_df["TOV"] / 3451 ( 3452 stats_df["FGA"] + 3453 (stats_df["FTA"] * 0.44) + 3454 stats_df["TOV"] 3455 ) 3456 ) 3457 stats_df["TOV%"] = stats_df["TOV%"].round(4) 3458 3459 double_double_stats = ["PTS", "TRB", "AST", "BLK", "STL"] 3460 stats_df["DBL_DBL"] = (stats_df[double_double_stats] >= 10).sum(1) >= 2 3461 stats_df["TRP_DBL"] = (stats_df[double_double_stats] >= 10).sum(1) >= 3 3462 3463 stats_df = stats_df.astype( 3464 { 3465 "DBL_DBL": "uint16", 3466 "TRP_DBL": "uint16", 3467 }, 3468 errors="ignore" 3469 ) 3470 stats_df = stats_df.reindex( 3471 columns=stat_columns 3472 ) 3473 stats_df["game_id"] = game_id 3474 # print(stats_df.columns) 3475 stats_df.to_csv( 3476 f"{home_dir}/.ncaa_stats_py/basketball_{sport_id}/game_stats/player/" 3477 + f"{game_id}_player_game_stats.csv", 3478 index=False 3479 ) 3480 return stats_df
Given a valid game ID, this function will attempt to get all player game stats, if possible.
NOTE: Due to an issue with stats.ncaa.org, full player game stats may not be loaded in through this function.
This is a known issue, however you should be able to get position data and starters information through this function
Parameters
game_id
(int, mandatory):
Required argument.
Specifies the game you want player game stats from.
Usage
from ncaa_stats_py.basketball import get_basketball_game_player_stats
########################################
# Men's Basketball #
########################################
# Get the game stats of the
# 2024 NCAA D1 Men's Basketball National Championship game.
print(
"Get the game stats of the "
+ "2024 NCAA D1 Men's Basketball National Championship game."
)
df = get_basketball_game_player_stats(5254137)
print(df)
# Get the game stats of a March Madness game on March 29th, 2024
# between Duke and the Houston Cougars.
print(
"Get the game stats of a March Madness game on March 29th, 2024 "
+ "between Duke and the Houston Cougars."
)
df = get_basketball_game_player_stats(5254126)
print(df)
# Get the game stats of a St. Patrick's Day
# game between the Duquesne Dukes and VCU Rams (D1).
print(
"Get the game stats of a St. Patrick's Day "
+ "game between the Duquesne Dukes and VCU Rams (D1)."
)
df = get_basketball_game_player_stats(5252318)
print(df)
# Get the game stats of a December 17th, 2023
# game between the Barry Buccaneers and Findlay Oilers (D2).
print(
"Get the game stats of a December 17th, 2023 "
+ "game between the Barry Buccaneers and Findlay Oilers (D2)."
)
df = get_basketball_game_player_stats(3960610)
print(df)
# Get the game stats of a Valentine's Day
# game between the Kalamazoo Hornets and the Trine Thunder (D2).
print(
"Get the game stats of a Valentine's Day "
+ "game between the Kalamazoo Hornets and the Trine Thunder (D2)."
)
df = get_basketball_game_player_stats(3967963)
print(df)
########################################
# Women's Basketball #
########################################
# Get the game stats of the
# 2024 NCAA D1 Women's Basketball National Championship game.
print(
"Get the game stats of the "
+ "2024 NCAA D1 Women's Basketball National Championship game"
)
df = get_basketball_game_player_stats(5254137)
print(df)
# Get the game stats of a March 3rd, 2024
# game between Duke and the North Carolina Tar Heels.
print(
"Get the game stats of a March 3rd, 2024 "
+ "game between Duke and the North Carolina Tar Heels"
)
df = get_basketball_game_player_stats(3984600)
print(df)
# Get the game stats of a Thanksgiving Day
# game between the Sacred Heart Pioneers and the P.R.-Mayaguez Janes (D2).
print(
"Get the game stats of a Thanksgiving Day "
+ "game between the Sacred Heart Pioneers and "
+ "the P.R.-Mayaguez Janes (D2)."
)
df = get_basketball_game_player_stats(3972687)
print(df)
# Get the game stats of a January 21st, 2024
# game between the Puget Sound Loggers
# and the Whitworth Pirates (D3).
print(
"Get the game stats of a January 21st, 2024 "
+ "game between the Puget Sound Loggers and "
+ "the Whitworth Pirates (D3)."
)
df = get_basketball_game_player_stats(3979051)
print(df)
Returns
A pandas DataFrame
object with player game stats in a given game.
3483def get_basketball_game_team_stats(game_id: int) -> pd.DataFrame: 3484 """ 3485 Given a valid game ID, 3486 this function will attempt to get all team game stats, if possible. 3487 3488 NOTE: Due to an issue with [stats.ncaa.org](stats.ncaa.org), 3489 full team game stats may not be loaded in through this function. 3490 3491 This is a known issue, however you should be able to get position 3492 data and starters information through this function 3493 3494 Parameters 3495 ---------- 3496 `game_id` (int, mandatory): 3497 Required argument. 3498 Specifies the game you want team game stats from. 3499 3500 Usage 3501 ---------- 3502 ```python 3503 3504 from ncaa_stats_py.basketball import get_basketball_game_team_stats 3505 3506 ######################################## 3507 # Men's Basketball # 3508 ######################################## 3509 3510 # Get the game stats of the 3511 # 2024 NCAA D1 Men's Basketball National Championship game. 3512 print( 3513 "Get the game stats of the " 3514 + "2024 NCAA D1 Men's Basketball National Championship game." 3515 ) 3516 df = get_basketball_game_team_stats(5254137) 3517 print(df) 3518 3519 # Get the game stats of a March Madness game on March 29th, 2024 3520 # between Duke and the Houston Cougars. 3521 print( 3522 "Get the game stats of a March Madness game on March 29th, 2024 " 3523 + "between Duke and the Houston Cougars." 3524 ) 3525 df = get_basketball_game_team_stats(5254126) 3526 print(df) 3527 3528 # Get the game stats of a St. Patrick's Day 3529 # game between the Duquesne Dukes and VCU Rams (D1). 3530 print( 3531 "Get the game stats of a St. Patrick's Day " 3532 + "game between the Duquesne Dukes and VCU Rams (D1)." 3533 ) 3534 df = get_basketball_game_team_stats(5252318) 3535 print(df) 3536 3537 # Get the game stats of a December 17th, 2023 3538 # game between the Barry Buccaneers and Findlay Oilers (D2). 3539 print( 3540 "Get the game stats of a December 17th, 2023 " 3541 + "game between the Barry Buccaneers and Findlay Oilers (D2)." 3542 ) 3543 df = get_basketball_game_team_stats(3960610) 3544 print(df) 3545 3546 # Get the game stats of a Valentine's Day 3547 # game between the Kalamazoo Hornets and the Trine Thunder (D2). 3548 print( 3549 "Get the game stats of a Valentine's Day " 3550 + "game between the Kalamazoo Hornets and the Trine Thunder (D2)." 3551 ) 3552 df = get_basketball_game_team_stats(3967963) 3553 print(df) 3554 3555 3556 ######################################## 3557 # Women's Basketball # 3558 ######################################## 3559 3560 # Get the game stats of the 3561 # 2024 NCAA D1 Women's Basketball National Championship game. 3562 print( 3563 "Get the game stats of the " 3564 + "2024 NCAA D1 Women's Basketball National Championship game" 3565 ) 3566 df = get_basketball_game_team_stats(5254137) 3567 print(df) 3568 3569 # Get the game stats of a March 3rd, 2024 3570 # game between Duke and the North Carolina Tar Heels. 3571 print( 3572 "Get the game stats of a March 3rd, 2024 " 3573 + "game between Duke and the North Carolina Tar Heels" 3574 ) 3575 df = get_basketball_game_team_stats(3984600) 3576 print(df) 3577 3578 # Get the game stats of a Thanksgiving Day 3579 # game between the Sacred Heart Pioneers and the P.R.-Mayaguez Janes (D2). 3580 print( 3581 "Get the game stats of a Thanksgiving Day " 3582 + "game between the Sacred Heart Pioneers and " 3583 + "the P.R.-Mayaguez Janes (D2)." 3584 ) 3585 df = get_basketball_game_team_stats(3972687) 3586 print(df) 3587 3588 # Get the game stats of a January 21st, 2024 3589 # game between the Puget Sound Loggers 3590 # and the Whitworth Pirates (D3). 3591 print( 3592 "Get the game stats of a January 21st, 2024 " 3593 + "game between the Puget Sound Loggers and " 3594 + "the Whitworth Pirates (D3)." 3595 ) 3596 df = get_basketball_game_team_stats(3979051) 3597 3598 ``` 3599 3600 Returns 3601 ---------- 3602 A pandas `DataFrame` object with team game stats in a given game. 3603 3604 """ 3605 df = get_basketball_game_player_stats(game_id=game_id) 3606 # print(df.columns) 3607 df = df.infer_objects() 3608 stats_df = df.groupby( 3609 ["season", "game_id", "team_id", "team_name"], 3610 as_index=False 3611 ).agg( 3612 { 3613 # "MP_minutes": "sum", 3614 # "MP_seconds": "sum", 3615 "MP_total_seconds": "sum", 3616 "FGM": "sum", 3617 "FGA": "sum", 3618 "3PM": "sum", 3619 "3PA": "sum", 3620 "2PM": "sum", 3621 "2PA": "sum", 3622 "FT": "sum", 3623 "FTA": "sum", 3624 "ORB": "sum", 3625 "DRB": "sum", 3626 "TRB": "sum", 3627 "AST": "sum", 3628 "STL": "sum", 3629 "BLK": "sum", 3630 "TOV": "sum", 3631 "PF": "sum", 3632 "TF": "sum", 3633 "PTS": "sum", 3634 "DQ": "sum", 3635 "DBL_DBL": "sum", 3636 "TRP_DBL": "sum", 3637 } 3638 ) 3639 stats_df["MP_str"] = stats_df["MP_total_seconds"].map( 3640 _get_minute_formatted_time_from_seconds 3641 ) 3642 3643 stats_df["FG%"] = (stats_df["FGM"] / stats_df["FGA"]) 3644 stats_df["FG%"] = stats_df["FG%"].round(4) 3645 3646 stats_df["3P%"] = (stats_df["3PM"] / stats_df["3PA"]) 3647 stats_df["3P%"] = stats_df["3P%"].round(4) 3648 3649 stats_df["FT%"] = (stats_df["FT"] / stats_df["FTA"]) 3650 stats_df["FT%"] = stats_df["FT%"].round(4) 3651 3652 stats_df["2PM"] = (stats_df["FGM"] - stats_df["3PM"]) 3653 stats_df["2PA"] = (stats_df["FGA"] - stats_df["3PA"]) 3654 stats_df["2P%"] = (stats_df["2PM"] / stats_df["2PA"]) 3655 stats_df["2P%"] = stats_df["2P%"].round(4) 3656 3657 stats_df["eFG%"] = ( 3658 ( 3659 stats_df["FGM"] + 3660 (stats_df["3PM"] * 0.5) 3661 ) / 3662 stats_df["FGA"] 3663 ) 3664 stats_df["eFG%"] = stats_df["eFG%"].round(4) 3665 3666 stats_df["TSA"] = ( 3667 stats_df["FGA"] + (stats_df["FTA"] * 0.44) 3668 ) 3669 stats_df["TS%"] = stats_df["PTS"] / (2 * stats_df["TSA"]) 3670 stats_df["TS%"] = stats_df["TS%"].round(4) 3671 3672 stats_df["TOV%"] = ( 3673 stats_df["TOV"] / 3674 ( 3675 stats_df["FGA"] + 3676 (stats_df["FTA"] * 0.44) + 3677 stats_df["TOV"] 3678 ) 3679 ) 3680 stats_df["TOV%"] = stats_df["TOV%"].round(4) 3681 3682 return stats_df
Given a valid game ID, this function will attempt to get all team game stats, if possible.
NOTE: Due to an issue with stats.ncaa.org, full team game stats may not be loaded in through this function.
This is a known issue, however you should be able to get position data and starters information through this function
Parameters
game_id
(int, mandatory):
Required argument.
Specifies the game you want team game stats from.
Usage
from ncaa_stats_py.basketball import get_basketball_game_team_stats
########################################
# Men's Basketball #
########################################
# Get the game stats of the
# 2024 NCAA D1 Men's Basketball National Championship game.
print(
"Get the game stats of the "
+ "2024 NCAA D1 Men's Basketball National Championship game."
)
df = get_basketball_game_team_stats(5254137)
print(df)
# Get the game stats of a March Madness game on March 29th, 2024
# between Duke and the Houston Cougars.
print(
"Get the game stats of a March Madness game on March 29th, 2024 "
+ "between Duke and the Houston Cougars."
)
df = get_basketball_game_team_stats(5254126)
print(df)
# Get the game stats of a St. Patrick's Day
# game between the Duquesne Dukes and VCU Rams (D1).
print(
"Get the game stats of a St. Patrick's Day "
+ "game between the Duquesne Dukes and VCU Rams (D1)."
)
df = get_basketball_game_team_stats(5252318)
print(df)
# Get the game stats of a December 17th, 2023
# game between the Barry Buccaneers and Findlay Oilers (D2).
print(
"Get the game stats of a December 17th, 2023 "
+ "game between the Barry Buccaneers and Findlay Oilers (D2)."
)
df = get_basketball_game_team_stats(3960610)
print(df)
# Get the game stats of a Valentine's Day
# game between the Kalamazoo Hornets and the Trine Thunder (D2).
print(
"Get the game stats of a Valentine's Day "
+ "game between the Kalamazoo Hornets and the Trine Thunder (D2)."
)
df = get_basketball_game_team_stats(3967963)
print(df)
########################################
# Women's Basketball #
########################################
# Get the game stats of the
# 2024 NCAA D1 Women's Basketball National Championship game.
print(
"Get the game stats of the "
+ "2024 NCAA D1 Women's Basketball National Championship game"
)
df = get_basketball_game_team_stats(5254137)
print(df)
# Get the game stats of a March 3rd, 2024
# game between Duke and the North Carolina Tar Heels.
print(
"Get the game stats of a March 3rd, 2024 "
+ "game between Duke and the North Carolina Tar Heels"
)
df = get_basketball_game_team_stats(3984600)
print(df)
# Get the game stats of a Thanksgiving Day
# game between the Sacred Heart Pioneers and the P.R.-Mayaguez Janes (D2).
print(
"Get the game stats of a Thanksgiving Day "
+ "game between the Sacred Heart Pioneers and "
+ "the P.R.-Mayaguez Janes (D2)."
)
df = get_basketball_game_team_stats(3972687)
print(df)
# Get the game stats of a January 21st, 2024
# game between the Puget Sound Loggers
# and the Whitworth Pirates (D3).
print(
"Get the game stats of a January 21st, 2024 "
+ "game between the Puget Sound Loggers and "
+ "the Whitworth Pirates (D3)."
)
df = get_basketball_game_team_stats(3979051)
Returns
A pandas DataFrame
object with team game stats in a given game.
3685def get_basketball_raw_pbp(game_id: int) -> pd.DataFrame: 3686 """ 3687 Given a valid game ID, 3688 this function will attempt to get the raw play-by-play (PBP) 3689 data for that game. 3690 3691 Parameters 3692 ---------- 3693 `game_id` (int, mandatory): 3694 Required argument. 3695 Specifies the game you want play-by-play data (PBP) from. 3696 3697 Usage 3698 ---------- 3699 ```python 3700 3701 from ncaa_stats_py.basketball import get_basketball_raw_pbp 3702 3703 ######################################## 3704 # Men's Basketball # 3705 ######################################## 3706 3707 # Get the play-by-play data of the 3708 # 2024 NCAA D1 Men's Basketball National Championship game. 3709 print( 3710 "Get the play-by-play data of the " 3711 + "2024 NCAA D1 Men's Basketball National Championship game." 3712 ) 3713 df = get_basketball_raw_pbp(5254137) 3714 print(df) 3715 3716 # Get the play-by-play data of a March Madness game on March 29th, 2024 3717 # between Duke and the Houston Cougars. 3718 print( 3719 "Get the play-by-play data " 3720 + "of a March Madness game on March 29th, 2024 " 3721 + "between Duke and the Houston Cougars." 3722 ) 3723 df = get_basketball_raw_pbp(5254126) 3724 print(df) 3725 3726 # Get the play-by-play data of a February 28th 3727 # game between the Winthrop Eagles and High Point Panthers. 3728 print( 3729 "Get the play-by-play data of a February 28th " 3730 + "game between the Winthrop Eagles and High Point Panthers." 3731 ) 3732 df = get_basketball_raw_pbp(3969302) 3733 print(df) 3734 3735 # Get the play-by-play data of a December 19th, 2022 3736 # game between the San Francisco St. Gators and 3737 # the Cal St. Monterey Bay Otters (D2). 3738 print( 3739 "Get the play-by-play data of a December 19th, 2022 " 3740 + "game between the San Francisco St. Gators and " + 3741 "the Cal St. Monterey Bay Otters (D2)." 3742 ) 3743 df = get_basketball_raw_pbp(2341500) 3744 print(df) 3745 3746 # Get the play-by-play data of a January 3rd, 2022 3747 # game between the Hamline Pipers and the St. Olaf Oles (D3). 3748 print( 3749 "Get the play-by-play data of a January 3rd, 2022 " 3750 + "game between the Hamline Pipers and the St. Olaf Oles (D3)." 3751 ) 3752 df = get_basketball_raw_pbp(3967963) 3753 print(df) 3754 3755 3756 ######################################## 3757 # Women's Basketball # 3758 ######################################## 3759 3760 # Get the play-by-play data of the 3761 # 2024 NCAA D1 Women's Basketball National Championship game. 3762 print( 3763 "Get the play-by-play data of the " 3764 + "2024 NCAA D1 Women's Basketball National Championship game." 3765 ) 3766 df = get_basketball_raw_pbp(5254137) 3767 print(df) 3768 3769 # Get the play-by-play data of a March 12th, 2021 3770 # game between the La Salle Explorers and the Dayton Flyers. 3771 print( 3772 "Get the play-by-play data of a March 12th, 2021 " 3773 + "game between the La Salle Explorers and the Dayton Flyers." 3774 ) 3775 df = get_basketball_raw_pbp(2055636) 3776 print(df) 3777 3778 # Get the play-by-play data of a February 6th, 2020 3779 # game between Purdue Northwest and the Michigan Tech Huskies (D2). 3780 print( 3781 "Get the play-by-play data of a Thanksgiving Day " 3782 + "game between the Sacred Heart Pioneers and " 3783 + "the P.R.-Mayaguez Janes (D2)." 3784 ) 3785 df = get_basketball_raw_pbp(1793405) 3786 print(df) 3787 3788 # Get the play-by-play data of a January 5th, 2019 3789 # game between the Puget Sound Loggers 3790 # and the Whitworth Pirates (D3). 3791 print( 3792 "Get the play-by-play data of a January 5th, 2019 " 3793 + "game between the Simpson Storm and " 3794 + "the Dubuque Spartans (D3)." 3795 ) 3796 df = get_basketball_raw_pbp(1625974) 3797 print(df) 3798 3799 ``` 3800 3801 Returns 3802 ---------- 3803 A pandas `DataFrame` object with a play-by-play (PBP) data in a given game. 3804 3805 """ 3806 load_from_cache = True 3807 is_overtime = False 3808 3809 sport_id = "" 3810 season = 0 3811 away_score = 0 3812 home_score = 0 3813 3814 mbb_teams_df = load_basketball_teams(get_wbb_data=False) 3815 mbb_team_ids_arr = mbb_teams_df["team_id"].to_list() 3816 3817 wbb_teams_df = load_basketball_teams(get_wbb_data=True) 3818 wbb_team_ids_arr = wbb_teams_df["team_id"].to_list() 3819 3820 pbp_df = pd.DataFrame() 3821 pbp_df_arr = [] 3822 temp_df = pd.DataFrame() 3823 3824 temp_df = pd.DataFrame() 3825 home_dir = expanduser("~") 3826 home_dir = _format_folder_str(home_dir) 3827 3828 stat_columns = [ 3829 "season", 3830 "game_id", 3831 "sport_id", 3832 "game_datetime", 3833 "half_num", 3834 "event_num", 3835 "game_time_str", 3836 "game_time_seconds", 3837 "game_time_milliseconds", 3838 "event_team", 3839 "event_text", 3840 "is_overtime", 3841 "stadium_name", 3842 "attendance", 3843 "away_team_id", 3844 "away_team_name", 3845 "home_team_id", 3846 "home_team_name", 3847 ] 3848 3849 url = f"https://stats.ncaa.org/contests/{game_id}/play_by_play" 3850 3851 if exists(f"{home_dir}/.ncaa_stats_py/"): 3852 pass 3853 else: 3854 mkdir(f"{home_dir}/.ncaa_stats_py/") 3855 3856 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/"): 3857 pass 3858 else: 3859 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/") 3860 3861 if exists(f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/"): 3862 pass 3863 else: 3864 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/") 3865 3866 if exists( 3867 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 3868 + f"{game_id}_raw_pbp.csv" 3869 ): 3870 games_df = pd.read_csv( 3871 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 3872 + f"{game_id}_raw_pbp.csv" 3873 ) 3874 games_df = games_df.infer_objects() 3875 file_mod_datetime = datetime.fromtimestamp( 3876 getmtime( 3877 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 3878 + f"{game_id}_raw_pbp.csv" 3879 ) 3880 ) 3881 load_from_cache = True 3882 else: 3883 file_mod_datetime = datetime.today() 3884 load_from_cache = False 3885 3886 if exists(f"{home_dir}/.ncaa_stats_py/"): 3887 pass 3888 else: 3889 mkdir(f"{home_dir}/.ncaa_stats_py/") 3890 3891 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/"): 3892 pass 3893 else: 3894 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/") 3895 3896 if exists(f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/"): 3897 pass 3898 else: 3899 mkdir(f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/") 3900 3901 if exists( 3902 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 3903 + f"{game_id}_raw_pbp.csv" 3904 ): 3905 games_df = pd.read_csv( 3906 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 3907 + f"{game_id}_raw_pbp.csv" 3908 ) 3909 games_df = games_df.infer_objects() 3910 file_mod_datetime = datetime.fromtimestamp( 3911 getmtime( 3912 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 3913 + f"{game_id}_raw_pbp.csv" 3914 ) 3915 ) 3916 load_from_cache = True 3917 else: 3918 logging.info("Could not find a WBB player game stats file") 3919 3920 now = datetime.today() 3921 3922 age = now - file_mod_datetime 3923 3924 if age.days >= 35: 3925 load_from_cache = False 3926 3927 if load_from_cache is True: 3928 return games_df 3929 3930 response = _get_webpage(url=url) 3931 soup = BeautifulSoup(response.text, features="lxml") 3932 3933 info_table = soup.find( 3934 "td", 3935 { 3936 "style": "padding: 0px 30px 0px 30px", 3937 "class": "d-none d-md-table-cell" 3938 } 3939 ).find( 3940 "table", 3941 {"style": "border-collapse: collapse"} 3942 ) 3943 3944 info_table_rows = info_table.find_all("tr") 3945 3946 game_date_str = info_table_rows[3].find("td").text 3947 if "TBA" in game_date_str: 3948 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBA') 3949 elif "tba" in game_date_str: 3950 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tba') 3951 elif "TBD" in game_date_str: 3952 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y TBD') 3953 elif "tbd" in game_date_str: 3954 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y tbd') 3955 elif ( 3956 "tbd" not in game_date_str.lower() and 3957 ":" not in game_date_str.lower() 3958 ): 3959 game_date_str = game_date_str.replace(" ", "") 3960 game_datetime = datetime.strptime(game_date_str, '%m/%d/%Y') 3961 else: 3962 game_datetime = datetime.strptime( 3963 game_date_str, 3964 '%m/%d/%Y %I:%M %p' 3965 ) 3966 game_datetime = game_datetime.astimezone(timezone("US/Eastern")) 3967 game_date_str = game_datetime.isoformat() 3968 del game_datetime 3969 3970 stadium_str = info_table_rows[4].find("td").text 3971 3972 attendance_str = info_table_rows[5].find("td").text 3973 attendance_int = re.findall( 3974 r"([0-9\,]+)", 3975 attendance_str 3976 )[0] 3977 attendance_int = attendance_int.replace(",", "") 3978 attendance_int = int(attendance_int) 3979 3980 del attendance_str 3981 team_cards = soup.find_all( 3982 "td", 3983 { 3984 "valign": "center", 3985 "class": "grey_text d-none d-sm-table-cell" 3986 } 3987 ) 3988 3989 away_url = team_cards[0].find_all("a") 3990 away_url = away_url[0] 3991 home_url = team_cards[1].find_all("a") 3992 home_url = home_url[0] 3993 3994 away_team_name = away_url.text 3995 home_team_name = home_url.text 3996 3997 away_team_id = away_url.get("href") 3998 home_team_id = home_url.get("href") 3999 4000 away_team_id = away_team_id.replace("/teams", "") 4001 away_team_id = away_team_id.replace("/team", "") 4002 away_team_id = away_team_id.replace("/", "") 4003 away_team_id = int(away_team_id) 4004 4005 home_team_id = home_team_id.replace("/teams", "") 4006 home_team_id = home_team_id.replace("/team", "") 4007 home_team_id = home_team_id.replace("/", "") 4008 home_team_id = int(home_team_id) 4009 4010 if home_team_id in mbb_team_ids_arr: 4011 sport_id = "MBB" 4012 temp_df = mbb_teams_df[mbb_teams_df["team_id"] == home_team_id] 4013 season = temp_df["season"].iloc[0] 4014 del temp_df 4015 elif home_team_id in wbb_team_ids_arr: 4016 sport_id = "WBB" 4017 temp_df = wbb_teams_df[wbb_teams_df["team_id"] == home_team_id] 4018 season = temp_df["season"].iloc[0] 4019 del temp_df 4020 # This should never be the case, 4021 # but if something goes very horribly wrong, 4022 # double check the away team ID to 4023 # the MBB and WBB team ID list. 4024 elif away_team_id in mbb_team_ids_arr: 4025 sport_id = "MBB" 4026 temp_df = mbb_teams_df[mbb_teams_df["team_id"] == away_team_id] 4027 season = temp_df["season"].iloc[0] 4028 del temp_df 4029 elif away_team_id in wbb_team_ids_arr: 4030 sport_id = "WBB" 4031 temp_df = wbb_teams_df[wbb_teams_df["team_id"] == home_team_id] 4032 season = temp_df["season"].iloc[0] 4033 del temp_df 4034 # If we get to this, we are in a code red situation. 4035 # "SHUT IT DOWN" - Gordon Ramsay 4036 else: 4037 raise ValueError( 4038 "Could not identify if this is a " + 4039 "MBB or WBB game based on team IDs. " 4040 ) 4041 4042 section_cards = soup.find_all( 4043 "div", 4044 {"class": "row justify-content-md-center w-100"} 4045 ) 4046 4047 for card in section_cards: 4048 # top_bot = "" 4049 event_text = "" 4050 half_str = card.find( 4051 "div", 4052 {"class": "card-header"} 4053 ).text 4054 half_num = re.findall( 4055 r"([0-9]+)", 4056 half_str 4057 ) 4058 4059 half_num = int(half_num[0]) 4060 if "ot" in half_str.lower(): 4061 is_overtime = True 4062 half_num += 2 4063 table_body = card.find("table").find("tbody").find_all("tr") 4064 4065 for row in table_body: 4066 t_cells = row.find_all("td") 4067 t_cells = [x.text.strip() for x in t_cells] 4068 game_time_str = t_cells[0] 4069 4070 if len(t_cells[1]) > 0: 4071 event_team = away_team_id 4072 event_text = t_cells[1] 4073 elif len(t_cells[3]) > 0: 4074 event_team = home_team_id 4075 event_text = t_cells[3] 4076 4077 if t_cells[1].lower() == "game start": 4078 pass 4079 elif t_cells[1].lower() == "jumpball startperiod": 4080 pass 4081 elif t_cells[1].lower() == "period start": 4082 pass 4083 elif t_cells[1].lower() == "period end confirmed;": 4084 pass 4085 elif t_cells[1].lower() == "period end confirmed": 4086 pass 4087 elif t_cells[1].lower() == "game end confirmed;": 4088 pass 4089 elif t_cells[1].lower() == "game end confirmed": 4090 pass 4091 elif t_cells[1].lower() == "timeout commercial": 4092 pass 4093 else: 4094 away_score, home_score = t_cells[2].split("-") 4095 4096 away_score = int(away_score) 4097 home_score = int(home_score) 4098 if len(game_time_str.split(":")) == 3: 4099 temp_time_minutes, temp_time_seconds, game_time_ms = \ 4100 game_time_str.split(":") 4101 elif len(game_time_str.split(":")) == 2: 4102 temp_time_minutes, temp_time_seconds = \ 4103 game_time_str.split(":") 4104 game_time_ms = 0 4105 4106 temp_time_minutes = int(temp_time_minutes) 4107 temp_time_seconds = int(temp_time_seconds) 4108 game_time_ms = int(game_time_ms) 4109 game_time_seconds = temp_time_seconds + (temp_time_minutes * 60) 4110 4111 if half_num == 1: 4112 half_seconds_remaining = game_time_seconds 4113 half_ms_remaining = game_time_ms 4114 4115 game_time_seconds += 1200 4116 else: 4117 half_seconds_remaining = game_time_seconds 4118 half_ms_remaining = game_time_ms 4119 4120 temp_df = pd.DataFrame( 4121 { 4122 # "season": season, 4123 # "game_id": game_id, 4124 # "sport_id": sport_id, 4125 # "away_team_id": away_team_id, 4126 # "away_team_name": away_team_name, 4127 # "home_team_id": home_team_id, 4128 # "home_team_name": home_team_name, 4129 "game_time_str": game_time_str, 4130 "half_seconds_remaining": half_seconds_remaining, 4131 "half_milliseconds_remaining": half_ms_remaining, 4132 "game_seconds_remaining": game_time_seconds, 4133 "game_milliseconds_remaining": game_time_ms, 4134 "half_num": half_num, 4135 "event_team": event_team, 4136 "event_text": event_text, 4137 "is_overtime": is_overtime 4138 }, 4139 index=[0], 4140 ) 4141 pbp_df_arr.append(temp_df) 4142 4143 pbp_df = pd.concat(pbp_df_arr, ignore_index=True) 4144 pbp_df["event_num"] = pbp_df.index + 1 4145 pbp_df["game_datetime"] = game_date_str 4146 pbp_df["season"] = season 4147 pbp_df["game_id"] = game_id 4148 pbp_df["sport_id"] = sport_id 4149 pbp_df["stadium_name"] = stadium_str 4150 pbp_df["attendance"] = attendance_int 4151 pbp_df["away_team_id"] = away_team_id 4152 pbp_df["away_team_name"] = away_team_name 4153 pbp_df["home_team_id"] = home_team_id 4154 pbp_df["home_team_name"] = home_team_name 4155 4156 pbp_df = pbp_df.reindex(columns=stat_columns) 4157 pbp_df = pbp_df.infer_objects() 4158 4159 if sport_id == "MBB": 4160 pbp_df.to_csv( 4161 f"{home_dir}/.ncaa_stats_py/basketball_MBB/raw_pbp/" 4162 + f"{game_id}_raw_pbp.csv", 4163 index=False 4164 ) 4165 elif sport_id == "WBB": 4166 pbp_df.to_csv( 4167 f"{home_dir}/.ncaa_stats_py/basketball_WBB/raw_pbp/" 4168 + f"{game_id}_raw_pbp.csv", 4169 index=False 4170 ) 4171 else: 4172 raise ValueError( 4173 f"Improper Sport ID: `{sport_id}`" 4174 ) 4175 4176 return pbp_df
Given a valid game ID, this function will attempt to get the raw play-by-play (PBP) data for that game.
Parameters
game_id
(int, mandatory):
Required argument.
Specifies the game you want play-by-play data (PBP) from.
Usage
from ncaa_stats_py.basketball import get_basketball_raw_pbp
########################################
# Men's Basketball #
########################################
# Get the play-by-play data of the
# 2024 NCAA D1 Men's Basketball National Championship game.
print(
"Get the play-by-play data of the "
+ "2024 NCAA D1 Men's Basketball National Championship game."
)
df = get_basketball_raw_pbp(5254137)
print(df)
# Get the play-by-play data of a March Madness game on March 29th, 2024
# between Duke and the Houston Cougars.
print(
"Get the play-by-play data "
+ "of a March Madness game on March 29th, 2024 "
+ "between Duke and the Houston Cougars."
)
df = get_basketball_raw_pbp(5254126)
print(df)
# Get the play-by-play data of a February 28th
# game between the Winthrop Eagles and High Point Panthers.
print(
"Get the play-by-play data of a February 28th "
+ "game between the Winthrop Eagles and High Point Panthers."
)
df = get_basketball_raw_pbp(3969302)
print(df)
# Get the play-by-play data of a December 19th, 2022
# game between the San Francisco St. Gators and
# the Cal St. Monterey Bay Otters (D2).
print(
"Get the play-by-play data of a December 19th, 2022 "
+ "game between the San Francisco St. Gators and " +
"the Cal St. Monterey Bay Otters (D2)."
)
df = get_basketball_raw_pbp(2341500)
print(df)
# Get the play-by-play data of a January 3rd, 2022
# game between the Hamline Pipers and the St. Olaf Oles (D3).
print(
"Get the play-by-play data of a January 3rd, 2022 "
+ "game between the Hamline Pipers and the St. Olaf Oles (D3)."
)
df = get_basketball_raw_pbp(3967963)
print(df)
########################################
# Women's Basketball #
########################################
# Get the play-by-play data of the
# 2024 NCAA D1 Women's Basketball National Championship game.
print(
"Get the play-by-play data of the "
+ "2024 NCAA D1 Women's Basketball National Championship game."
)
df = get_basketball_raw_pbp(5254137)
print(df)
# Get the play-by-play data of a March 12th, 2021
# game between the La Salle Explorers and the Dayton Flyers.
print(
"Get the play-by-play data of a March 12th, 2021 "
+ "game between the La Salle Explorers and the Dayton Flyers."
)
df = get_basketball_raw_pbp(2055636)
print(df)
# Get the play-by-play data of a February 6th, 2020
# game between Purdue Northwest and the Michigan Tech Huskies (D2).
print(
"Get the play-by-play data of a Thanksgiving Day "
+ "game between the Sacred Heart Pioneers and "
+ "the P.R.-Mayaguez Janes (D2)."
)
df = get_basketball_raw_pbp(1793405)
print(df)
# Get the play-by-play data of a January 5th, 2019
# game between the Puget Sound Loggers
# and the Whitworth Pirates (D3).
print(
"Get the play-by-play data of a January 5th, 2019 "
+ "game between the Simpson Storm and "
+ "the Dubuque Spartans (D3)."
)
df = get_basketball_raw_pbp(1625974)
print(df)
Returns
A pandas DataFrame
object with a play-by-play (PBP) data in a given game.
4179def get_basketball_game_starters(game_id: int) -> list: 4180 """ 4181 Given a valid game ID, this function will attempt to 4182 get the starting lineup out of the raw play-by-play data 4183 from the game. 4184 4185 NOTE #1: The layout of the list will be as follows: 4186 4187 > | Index | **Away players** | 4188 > | :---: | :------------------: | 4189 > | 0 | Away team starter #1 | 4190 > | 1 | Away team starter #2 | 4191 > | 2 | Away team starter #3 | 4192 > | 3 | Away team starter #4 | 4193 > | 4 | Away team starter #5 | 4194 4195 > | Index | **Home players** | 4196 > | :---: | :------------------: | 4197 > | 5 | Home team starter #1 | 4198 > | 6 | Home team starter #2 | 4199 > | 7 | Home team starter #3 | 4200 > | 8 | Home team starter #4 | 4201 > | 9 | Home team starter #5 | 4202 4203 NOTE #2: Starters are listed in order of when they first sub out. 4204 Do not assume that starter #5 for a team is a center, 4205 or that starter #1 is a PG! 4206 4207 Returns 4208 ---------- 4209 A list of starters from a specific basketball game ID. 4210 4211 """ 4212 starters_list = [] 4213 pbp_df = get_basketball_raw_pbp(game_id=game_id) 4214 away_team_id = pbp_df["away_team_id"].iloc[0] 4215 home_team_id = pbp_df["home_team_id"].iloc[0] 4216 # pointer_int = 0 4217 4218 for team_id in [away_team_id, home_team_id]: 4219 temp_starters_list = [] 4220 4221 temp_df = pbp_df[pbp_df["event_team"] == team_id] 4222 4223 play_text_list = temp_df["event_text"].to_list() 4224 4225 for play_txt in play_text_list: 4226 if len(temp_starters_list) == 5: 4227 break 4228 elif "substitution out" in play_txt: 4229 player_txt = play_txt.split(",")[0] 4230 if play_txt in temp_starters_list: 4231 pass 4232 elif player_txt.lower() == "team": 4233 pass 4234 elif (player_txt is None) or (len(player_txt) == 0): 4235 raise ValueError( 4236 "Player cannot be NULL." 4237 ) 4238 else: 4239 temp_starters_list.append(player_txt) 4240 4241 if len(temp_starters_list) < 5: 4242 raise ValueError( 4243 f"Could not find all 5 starters for team ID {team_id} " + 4244 f"in game ID {game_id}" 4245 ) 4246 for txt in temp_starters_list: 4247 starters_list.append(txt) 4248 return starters_list
Given a valid game ID, this function will attempt to get the starting lineup out of the raw play-by-play data from the game.
NOTE #1: The layout of the list will be as follows:
Index Away players 0 Away team starter #1 1 Away team starter #2 2 Away team starter #3 3 Away team starter #4 4 Away team starter #5
Index Home players 5 Home team starter #1 6 Home team starter #2 7 Home team starter #3 8 Home team starter #4 9 Home team starter #5
NOTE #2: Starters are listed in order of when they first sub out. Do not assume that starter #5 for a team is a center, or that starter #1 is a PG!
Returns
A list of starters from a specific basketball game ID.