【競馬AI 13】作成したモデルを使って2023年の回収率を計算する

競馬AI

2024.02.05

競馬AI⑤で作成した回収率計算の更新版になります。

この前の記事で穴馬モデルも作成したので、上位予測馬と穴馬の組み合わせでどのくらいの回収率になるのか、2023年のデータを使って確認してみます。

今回は馬連とワイドの回収率が計算できるように修正しました。

※モデルは私の方でカスタマイズしたものになりますので、同じ結果にはなりません

回収率を計算するコード
コードの解説
1. テストデータ分割
2. 予測する馬の選択
実行結果
まとめ

回収率を計算するコード

モデル名やファイル名などは自分の環境に合わせて修正してください。

コード中のdrop_arr（予測前に除外する特徴量のリスト）はモデルによって必ず異なるので、必ず自身のコードと合わせて修正してください！

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score
import ast
from sklearn import metrics

def split_date(df, test_size):
    """Split *df* into train and test parts by ratio, ordered by ``race_id``.

    The rows are sorted by ``race_id`` and the unique index labels are taken
    in that order; the leading ``1 - test_size`` share of those labels forms
    the training set and the remainder forms the test set.

    Args:
        df: DataFrame with a ``race_id`` column.
        test_size: Fraction (0-1) of the unique index labels to put in test.

    Returns:
        ``(train, test)`` DataFrames selected from *df* with ``.loc``.
    """
    # NOTE(review): the cut is made on index labels, so races stay together
    # only if the index itself is the race id -- confirm against the caller.
    ordered_labels = df.sort_values('race_id').index.unique()
    cut = round(len(ordered_labels) * (1 - test_size))
    head, tail = ordered_labels[:cut], ordered_labels[cut:]
    return df.loc[head], df.loc[tail]

def split_date_by_year(df, test_year=2023):
    """Split *df* into train and test sets by the year encoded in ``race_id``.

    The first four characters of ``race_id`` (as a string) are read as the
    year. Rows of ``test_year`` become the test set; all other rows become
    the training set. The input DataFrame is left untouched.

    Args:
        df: DataFrame with a ``race_id`` column whose first four characters
            are the year.
        test_year: Year to hold out as the test set (default 2023).

    Returns:
        ``(train, test)`` DataFrames. Neither contains a ``year`` column.
    """
    # Keep the year in a standalone Series instead of writing a temporary
    # column into the caller's DataFrame.
    year = df['race_id'].astype(str).str[:4].astype(int)
    is_test = year == test_year

    # drop(..., errors='ignore') keeps the historical output shape: the
    # returned frames never carry a 'year' column, and they are new objects.
    train = df[~is_test].drop(columns=['year'], errors='ignore')
    test = df[is_test].drop(columns=['year'], errors='ignore')

    return train, test

def parse_horse_numbers(horse_number_str):
    """Turn a hyphen-separated horse-number string into a list of ints.

    Example: ``"1-3"`` becomes ``[1, 3]``.
    """
    return list(map(int, horse_number_str.split('-')))

def _to_yen(payout_text):
    """Convert a payout string such as '1,230' to an int."""
    return int(payout_text.replace(',', ''))


def _percent(returned, staked):
    """Return returned / staked as a percentage, or 0.0 when nothing was staked."""
    return returned / staked * 100 if staked else 0.0


def _hit_rate_text(hits, bets):
    """Format 'hits/bets(rate%)' without dividing by zero when no bet was placed."""
    if not bets:
        return f"{hits}/0(-%)"
    return f"{hits}/{bets}({hits / bets * 100:.1f}%)"


# Saved LightGBM model files: the main model and the long-shot ("ana") model
model_file = 'model/model.txt'
model_file2 = 'model/model_ana.txt'

# Load the encoded data and binarise the target: 1 for a winner, 0 otherwise
data = pd.read_csv('encoded/encoded_data.csv')
data['着順'] = data['着順'].map(lambda x: 1 if x == 1 else 0)

# Choose how the evaluation set is cut out (swap the comment to change it):
# train, test = split_date(data, 0.2)
train, test = split_date_by_year(data)
# NOTE(review): the figures below are out-of-sample only if both models were
# trained without the rows selected here -- confirm against the training script.

race_ids = test['race_id']
# Columns removed before prediction. The remaining columns must match the
# features the models were trained on, so adjust this list to your own model.
drop_arr = [
    'race_id', '着順', '上がり', '走破時間', '通過順', '人気', 'オッズ', '騎手',
    'レース名', '開催', '馬の平均着順', '馬の3着内率', '最終追切_評価',
]
X_test = test.drop(drop_arr, axis=1)

# Load the already-trained models
model = lgb.Booster(model_file=model_file)
model2 = lgb.Booster(model_file=model_file2)

# Predict with both models on the test rows
y_pred = model.predict(X_test)
y_pred2 = model2.predict(X_test)

# Re-attach the identifiers, predictions and target for the evaluation below
X_test.insert(0, 'race_id', race_ids)
X_test.insert(1, '予測結果', y_pred)
X_test.insert(2, '予測結果2', y_pred2)
X_test.insert(3, '着順', test['着順'])

# Number of races in the test set
race_count = X_test['race_id'].nunique()

# For every race keep only the row with the highest main-model prediction
max_predictions_per_race = X_test.groupby('race_id')['予測結果'].idxmax()
X_test_new = X_test.loc[max_predictions_per_race]

# Read the yearly payout CSVs and combine them, indexed by the first column
# (used as the race id in the lookup below).
years = range(2005, 2024)
payback_frames = [
    pd.read_csv(f"payback/{year}.csv", encoding="SHIFT-JIS", header=None)
    for year in years
]
betting_data = pd.concat(payback_frames, ignore_index=True)
betting_data.set_index(betting_data.iloc[:, 0].astype(str).str.strip(), inplace=True)
# NOTE(review): the lookup below assumes each race id appears once in the
# payout files -- confirm; duplicates would make .loc return a DataFrame.

win_return_amount = 0  # total returned on win (単勝) bets
place_return_amount = 0  # total returned on place (複勝) bets
umaren_return_amount = 0  # total returned on quinella (馬連) bets
wide_return_amount = 0  # total returned on wide (ワイド) bets
betting_count = 0  # number of races bet on
total_betting_amount = 0  # total staked on win / place
total_betting_amount_umaren = 0  # total staked on quinella
total_betting_amount_wide = 0  # total staked on wide
umaren_hit_count = 0
wide_hit_count = 0

# Stake per bet. Defined before the loop: it is needed while the pair bet is
# being booked, which happens before the win/place stake is added.
bet = 100

# Threshold: a horse is bought when its main-model prediction is at or above this
threshold = 0.85

# Races to bet on: top-predicted horse at or above the threshold, クラス 3 excluded
bet_mask = (X_test_new['予測結果'] >= threshold) & (X_test_new['クラス'] != 3)

for _, row in X_test_new[bet_mask].iterrows():
    race_id = str(int(float(row['race_id'])))
    horse_number = str(int(float(row['馬番'])))

    # Long-shot partner: among this race's rows whose 予測結果2 is at least
    # -0.3, take the one with the highest 予測結果2.
    race_rows = X_test[X_test['race_id'] == row['race_id']]
    race_rows_max_pred2 = race_rows[race_rows['予測結果2'] >= -0.3]
    horse_number2 = None  # None means no quinella / wide bet for this race
    if not race_rows_max_pred2.empty:
        row2 = race_rows_max_pred2.loc[race_rows_max_pred2['予測結果2'].idxmax()]
        candidate = str(int(float(row2['馬番'])))
        # A pair bet needs two different horses
        if candidate != horse_number:
            horse_number2 = candidate
            total_betting_amount_umaren += bet
            total_betting_amount_wide += bet

    betting_count += 1
    total_betting_amount += bet

    if race_id not in betting_data.index:
        print(f"Race ID {race_id} not found in betting data.")
        continue

    race_data = betting_data.loc[race_id]
    # Column 1 holds a Python-literal list; entries 0-3 are read as the
    # win, place, quinella and wide payouts respectively.
    race_data_list = ast.literal_eval(race_data[1])
    win_data = race_data_list[0]
    place_data = race_data_list[1]
    umaren_data = race_data_list[2]
    wide_data = race_data_list[3]

    # Each payout list alternates key, payout; payouts are scaled by bet / 100.
    for number, payout in zip(win_data[0::2], win_data[1::2]):
        if number == horse_number:
            win_return_amount += _to_yen(payout) * bet / 100

    for number, payout in zip(place_data[0::2], place_data[1::2]):
        if number == horse_number:
            place_return_amount += _to_yen(payout) * bet / 100

    if horse_number2 is not None:
        pair = {int(horse_number), int(horse_number2)}

        for combo, payout in zip(umaren_data[0::2], umaren_data[1::2]):
            if pair <= set(parse_horse_numbers(combo)):
                umaren_hit_count += 1
                umaren_return_amount += _to_yen(payout) * bet / 100

        for combo, payout in zip(wide_data[0::2], wide_data[1::2]):
            if pair <= set(parse_horse_numbers(combo)):
                wide_hit_count += 1
                wide_return_amount += _to_yen(payout) * bet / 100

# Return rates in percent (0.0 when nothing was staked on that bet type)
win_return_rate = _percent(win_return_amount, total_betting_amount)
place_return_rate = _percent(place_return_amount, total_betting_amount)
umaren_return_rate = _percent(umaren_return_amount, total_betting_amount_umaren)
wide_return_rate = _percent(wide_return_amount, total_betting_amount_wide)

# Among the races bet on: winners (true positives) and non-winners (false positives)
TP = (X_test_new['着順'] == 1) & bet_mask
FP = (X_test_new['着順'] == 0) & bet_mask

TP_count = int(TP.sum())
FP_count = int(FP.sum())

accuracy_TP = _percent(TP_count, betting_count)
misclassification_rate_FP = _percent(FP_count, betting_count)

umaren_bet_count = total_betting_amount_umaren // bet
wide_bet_count = total_betting_amount_wide // bet

print("Race count:", race_count)
print("Betting cases:", betting_count)
print("True positives:", TP_count, "(", "{:.2f}".format(accuracy_TP), "%)")
print("False positives:", FP_count, "(", "{:.2f}".format(misclassification_rate_FP), "%)")

print("単勝回収率:", round(win_return_rate, 3))
print("複勝回収率:", round(place_return_rate, 3))
print("馬連的中率:", _hit_rate_text(umaren_hit_count, umaren_bet_count))
print("馬連回収率:", round(umaren_return_rate, 3))
print("ワイド的中率:", _hit_rate_text(wide_hit_count, wide_bet_count))
print("ワイド回収率:", round(wide_return_rate, 3))

コードの解説

テストデータ分割

コード上部にある2つの関数はそれぞれレースデータを学習に使う分とテストに使う分に分ける関数になっています。

今回は2023年だけでテストがしたかったので、”split_date_by_year”関数を使っていきます。必要に応じて使う関数は変更してください。
※「train, test = split_date(data, 0.2)」と「train, test = split_date_by_year(data)」の行のコメントアウトを付け替える

def split_date(df, test_size):
    """Split *df* into train and test parts by ratio, ordered by ``race_id``.

    The rows are sorted by ``race_id`` and the unique index labels are taken
    in that order; the leading ``1 - test_size`` share of those labels forms
    the training set and the remainder forms the test set.

    Args:
        df: DataFrame with a ``race_id`` column.
        test_size: Fraction (0-1) of the unique index labels to put in test.

    Returns:
        ``(train, test)`` DataFrames selected from *df* with ``.loc``.
    """
    # NOTE(review): the cut is made on index labels, so races stay together
    # only if the index itself is the race id -- confirm against the caller.
    ordered_labels = df.sort_values('race_id').index.unique()
    cut = round(len(ordered_labels) * (1 - test_size))
    head, tail = ordered_labels[:cut], ordered_labels[cut:]
    return df.loc[head], df.loc[tail]

def split_date_by_year(df, test_year=2023):
    """Split *df* into train and test sets by the year encoded in ``race_id``.

    The first four characters of ``race_id`` (as a string) are read as the
    year. Rows of ``test_year`` become the test set; all other rows become
    the training set. The input DataFrame is left untouched.

    Args:
        df: DataFrame with a ``race_id`` column whose first four characters
            are the year.
        test_year: Year to hold out as the test set (default 2023).

    Returns:
        ``(train, test)`` DataFrames. Neither contains a ``year`` column.
    """
    # Keep the year in a standalone Series instead of writing a temporary
    # column into the caller's DataFrame.
    year = df['race_id'].astype(str).str[:4].astype(int)
    is_test = year == test_year

    # drop(..., errors='ignore') keeps the historical output shape: the
    # returned frames never carry a 'year' column, and they are new objects.
    train = df[~is_test].drop(columns=['year'], errors='ignore')
    test = df[is_test].drop(columns=['year'], errors='ignore')

    return train, test

予測する馬の選択

軸となる馬の選択はメインのモデルの予測値が0.85以上の馬にしています。各レースに0.85以上の馬が2頭以上いた場合、予測値が一番高い馬を選択しています。

コード中の「threshold = 0.85」の行で閾値を設定しています。自身のモデルで最も回収率が良い値を設定してください。

# Threshold: a horse is bought when its predicted value is at or above this
threshold = 0.85

※閾値を高くすればするほど回収率は上がりますが、賭けるレース数が減ってしまうので注意してください

穴馬の選択は、穴馬モデルの予測値が-0.3以上の馬（複数いる場合は予測値が最も高い馬）にしています。損失関数を使って穴馬予想モデルを作ったら全体的に予測値がマイナス域になってしまいました。
このあたりの閾値もご自身で探ってみてください。

※該当箇所（race_rows_max_pred2を求めている行）

race_rows_max_pred2 = race_rows[race_rows['予測結果2'] >= -0.3]  # keep only the rows whose 予測結果2 is at least -0.3

実行結果

実行結果は次のようになりました。

私の環境では2023年の”芝”のレースだけ、かつ新馬戦を除いたレースを対象に回収率を出しています。

Race count: 1614
Betting cases: 917
True positives: 538 ( 58.67 %)
False positives: 379 ( 41.33 %)
単勝回収率: 239.422
複勝回収率: 135.344
馬連的中率: 19/100(19.0%)
馬連回収率: 1341.6
ワイド的中率: 40/100(40.0%)
ワイド回収率: 1136.6

見方としては、1614レース中、0.85の閾値以上の馬が存在する917レースに賭けました。
その場合、単勝が的中したのが538レース、外れたのが379レース、単勝回収率が239%となります。

また、0.85の閾値を超えた馬が存在し、かつ穴馬も-0.3の閾値を超えたレースが100レースあり、馬連の的中率が19%で回収率が1342%、ワイドの的中率が40%で回収率が1137%となります。

まとめ

今回自分が作成したモデルの精度を2023年のデータで試してみたわけですが、回収率が驚異の1000%越え！

的中回数は年間で考えるとかなり少なくなりますが、2023年はこれだけ買っていれば大勝出来ていたことになります。

自分自身「本当か？」と疑いたくなるような結果なので、もしコードに誤りなどあれば指摘してください。

当面は信じて買ってみます。

ダートの方の回収率も下に載せておきます。ご自身のモデルの精度を試してみてください！

Race count: 1574
Betting cases: 991
True positives: 613 ( 61.86 %)
False positives: 378 ( 38.14 %)
単勝回収率: 343.845
複勝回収率: 162.23
馬連的中率: 34/164(20.7%)
馬連回収率: 1955.427
ワイド的中率: 72/164(43.9%)
ワイド回収率: 914.268

NS より:
2024年3月10日 1:58 PM
コメント失礼いたします。
本サイトを参考に予想AIを作成し、学ばさせていただいております。
その中で、以下の個所について質問させてください。
>>51行目のdrop_arrは絶対にモデルによって異なるので、必ず自身のコードと合わせて修正してください！
とありますが、この箇所のモデルは、自分で作成したmodel.txtの
feature_names=race_id 馬騎手馬番体重体重変化性齢斤量レース名日付開催クラス芝・ダート距離回り馬場天気場id 場名日付1 馬番1 騎手1 斤量1 オッズ1 体重1 体重変化1 上がり1 通過順1 着順1 距離1 クラス1 走破時間1 芝・ダート1 天気1 馬場1 日付2 馬番2 騎手2 斤量2 オッズ2 体重2 体重変化2 上がり2 通過順2 着順2 距離2 クラス2 走破時間2 芝・ダート2 天気2 馬場2 日付3 馬番3 騎手3 斤量3 オッズ3 体重3 体重変化3 上がり3 通過順3 着順3 距離3 クラス3 走破時間3 芝・ダート3 天気3 馬場3 日付4 馬番4 騎手4 斤量4 オッズ4 体重4 体重変化4 上がり4 通過順4 着順4 距離4 クラス4 走破時間4 芝・ダート4 天気4 馬場4 日付5 馬番5 騎手5 斤量5 オッズ5 体重5 体重変化5 上がり5 通過順5 着順5 距離5 クラス5 走破時間5 芝・ダート5 天気5 馬場5 距離差日付差距離差1 日付差1 距離差2 日付差2 距離差3 日付差3 距離差4 日付差4 平均斤量騎手の勝率
ここの個所をさしているのでしょうか？
自分のモデルに合わせるの意味があまり分かっておらず、、
お手数をおかけしますが、ご回答いただければ幸いです。
以下実行時エラーコード
[LightGBM] [Fatal] The number of features in data (102) is not the same as it was in training data (112).
You can set “predict_disable_shape_check=true“ to discard this error, but please be aware what you are doing.
Traceback (most recent call last):
File “C:\Users\user\Desktop\data\calc_return.py”, line 60, in
y_pred = model.predict(X_test)
^^^^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgbm\basic.py”, line 4453, in predict
return predictor.predict(
^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgbm\basic.py”, line 1159, in predict
preds, nrow = self.__pred_for_np2d(
^^^^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgbm\basic.py”, line 1306, in __pred_for_np2d
return self.__inner_predict_np2d(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgbm\basic.py”, line 1259, in __inner_predict_np2d
_safe_call(_LIB.LGBM_BoosterPredictForMat(
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightgbm\basic.py”, line 263, in _safe_call
raise LightGBMError(_LIB.LGBM_GetLastError().decode(‘utf-8’))
lightgbm.basic.LightGBMError: The number of features in data (102) is not the same as it was in training data (112).
You can set “predict_disable_shape_check=true“ to discard this error, but please be aware what you are doing.
返信
- agus より:
  2024年3月11日 7:44 AM
  model.txtは直接修正しません。
  コードの以下の部分はモデルに悪影響を及ぼす特徴量を指定し、除外しています。
  モデルによって除外する特徴量は違うので、影響度合いを確認して修正が必要です。
  drop_arr=[
  ‘race_id’,’着順’,’上がり’,’走破時間’,’通過順’,’人気’,’オッズ’,’騎手’,’レース名’,’開催’,’馬の平均着順’, ‘馬の3着内率’,’最終追切_評価’]
  返信
oshm より:
2024年4月15日 10:14 PM
いつも楽しく拝見しています。
特にスクレイピングのコードは大変参考になりました、ありがとうございます。
本記事にある回収率については、モデルで計算された数値に収束していくまでにある程度のレース数をこなす必要があるように思います。
釈迦に説法のようですが、横軸にレース数/縦軸に回収率(cumsum)をとった推移グラフを描画すると、いつごろ最終的な回収率に収束しそうかの目安になります。
弊方も自作のモデルで確認したところ、最終的な回収率が65%程度なのですが、
そこに到達するまでにレース数としては200レースほどを要しました。
コードを参考にさせていただいたお礼とコメントを兼ねて、失礼いたしました。
返信
HIRO より:
2024年5月18日 11:15 AM
お忙しいところ申し訳ございません。
下記のエラーが発生しました。
C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\dask\dataframe\__init__.py:31: FutureWarning:
Dask dataframe query planning is disabled because dask-expr is not installed.
You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.
warnings.warn(msg, FutureWarning)
Traceback (most recent call last):
File “C:\Users\user\Desktop\Keiba\calc_return.py”, line 53, in
X_test = test.drop(drop_arr, axis=1)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\frame.py”, line 5581, in drop
return super().drop(
^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py”, line 4788, in drop
obj = obj._drop_axis(labels, axis, level=level, errors=errors)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py”, line 4830, in _drop_axis
new_axis = axis.drop(labels, errors=errors)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “C:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py”, line 7070, in drop
raise KeyError(f”{labels[mask].tolist()} not found in axis”)
KeyError: “[‘馬の平均着順’, ‘馬の3着内率’, ‘最終追切_評価’] not found in axis”
>>51行目のdrop_arrは絶対にモデルによって異なるが原因か？
教えて下さい。宜しくお願い致します。
返信
- agus より:
  2024年5月19日 9:08 AM
  [‘馬の平均着順’, ‘馬の3着内率’, ‘最終追切_評価’]
  この特徴量はデータに含まれていますか？
  そもそも含まれていないのであれば、書く必要はありません。
  おっしゃる通り、drop_arrを自分のデータと見比べてください。
  返信
TT より:
2024年7月27日 10:34 PM
お忙しい所失礼します。
drop_arrですが、model.txtにあって、encode.txtない特徴量を除外するという認識であっていますでしょうか？
昨年の宝塚で検証した結果、予想ファイルと出力されてる閾値が違うので原因など分からないでしょうか？
また、閾値が一定以上の馬が複数いる場合は閾値が最も高い馬に賭けるようになっているのでしょうか？
複数の質問で恐縮ですが、お教えください。
返信
MS より:
2024年8月24日 4:49 PM
いつも大変参考にさせていただいております。
1点確認したい挙動があり、質問させてください。calc_return.pyとpredict.pyの予測値の違いについてです。
対象のレースのみをpickupして確認しようと思い、レースID「202410030809」で予測を行いました。
calc_return.pyの結果
予測結果：0.882826
馬番：14
predict.pyの結果
予測結果：0.8035749687797438
馬番：14
上記の通り、それぞれの結果で予測結果が異なる状態となりました。
※同じmodelファイルを参照していることは確認済みです。
※calc_return.pyはブレークポイントをおいてprintして確認
※predict.pyはrace_table_scraping.pyで取得、実行後のresultファイルで確認
カテゴリカル変数のエンコーディングの影響（値としては別になるので）かと思い、共に特徴量から除外してみたのですが、それでも結果が一致することはありませんでした。
（ここの結果が異なると、予測結果の閾値を0.85以上を賭けるとした場合にはpredict.pyだと閾値以下になるため対象レースとならず、
回収率が合わなくなるのでは？と思っています。。）
agus様の方では、
・calc_return.pyとpredict.pyで同じ予測結果が得られますでしょうか？
・予測結果が異なる原因として考えられるものはございますでしょうか？
勉強不足で恐縮ですが、ご見解いただけますと幸いです。
返信
- agus より:
  2024年8月31日 11:42 PM
  私の方では厳密に予測結果の比較まで行っていないのでわかりません。。。
  考えられる原因としては以下のようなものではないでしょうか？
  ・calc_return.pyとpredict.pyで行っている前処理が異なる
  ・使用したファイルの項目の順番が異なる
  あまりお役に立てずすみません。
  返信