import polars as pl
import polars.selectors as cs
pl.Config.set_tbl_cols(100)
pl.Config.set_tbl_width_chars(200)

import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline
import japanize_matplotlib

import seaborn as sns

import plotly
plotly.offline.init_notebook_mode()  # github pages 対応
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from datetime import date
import matplotlib.dates as mdates

import jpholiday

from prophet import Prophet

# Load the training data, the test data and the sample submission file.
# The sample submission is read with has_header=False, so polars assigns
# generic column names (column_1, column_2).
train = pl.read_csv("../data/input/train.csv")
test = pl.read_csv("../data/input/test.csv")
sample_submit = pl.read_csv("../data/output/sample_submit.csv", has_header=False)

# Print each frame under a heading for a quick look at shapes and dtypes.
for _heading, _frame in (
    ("train\n", train),
    ("test\n", test),
    ("sample_submit\n", sample_submit),
):
    print(_heading, _frame)

train
 shape: (2_101, 6)
┌────────────┬─────┬────────┬───────┬──────────┬──────────┐
│ datetime   ┆ y   ┆ client ┆ close ┆ price_am ┆ price_pm │
│ ---        ┆ --- ┆ ---    ┆ ---   ┆ ---      ┆ ---      │
│ str        ┆ i64 ┆ i64    ┆ i64   ┆ i64      ┆ i64      │
╞════════════╪═════╪════════╪═══════╪══════════╪══════════╡
│ 2010-07-01 ┆ 17  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-02 ┆ 18  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-03 ┆ 20  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-04 ┆ 20  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-05 ┆ 14  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ …          ┆ …   ┆ …      ┆ …     ┆ …        ┆ …        │
│ 2016-03-27 ┆ 94  ┆ 1      ┆ 0     ┆ 5        ┆ 4        │
│ 2016-03-28 ┆ 86  ┆ 1      ┆ 0     ┆ 4        ┆ 4        │
│ 2016-03-29 ┆ 98  ┆ 1      ┆ 0     ┆ 4        ┆ 4        │
│ 2016-03-30 ┆ 99  ┆ 1      ┆ 0     ┆ 5        ┆ 4        │
│ 2016-03-31 ┆ 105 ┆ 1      ┆ 0     ┆ 5        ┆ 4        │
└────────────┴─────┴────────┴───────┴──────────┴──────────┘
test
 shape: (365, 5)
┌────────────┬────────┬───────┬──────────┬──────────┐
│ datetime   ┆ client ┆ close ┆ price_am ┆ price_pm │
│ ---        ┆ ---    ┆ ---   ┆ ---      ┆ ---      │
│ str        ┆ i64    ┆ i64   ┆ i64      ┆ i64      │
╞════════════╪════════╪═══════╪══════════╪══════════╡
│ 2016-04-01 ┆ 1      ┆ 0     ┆ 3        ┆ 2        │
│ 2016-04-02 ┆ 0      ┆ 0     ┆ 5        ┆ 5        │
│ 2016-04-03 ┆ 1      ┆ 0     ┆ 2        ┆ 2        │
│ 2016-04-04 ┆ 1      ┆ 0     ┆ 1        ┆ 1        │
│ 2016-04-05 ┆ 0      ┆ 0     ┆ 1        ┆ 1        │
│ …          ┆ …      ┆ …     ┆ …        ┆ …        │
│ 2017-03-27 ┆ 0      ┆ 0     ┆ 5        ┆ 4        │
│ 2017-03-28 ┆ 0      ┆ 0     ┆ 4        ┆ 4        │
│ 2017-03-29 ┆ 0      ┆ 0     ┆ 3        ┆ 3        │
│ 2017-03-30 ┆ 0      ┆ 0     ┆ 4        ┆ 3        │
│ 2017-03-31 ┆ 1      ┆ 0     ┆ 5        ┆ 4        │
└────────────┴────────┴───────┴──────────┴──────────┘
sample_submit
 shape: (365, 2)
┌────────────┬───────────┐
│ column_1   ┆ column_2  │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ 2016-04-01 ┆ 65.989206 │
│ 2016-04-02 ┆ 73.07633  │
│ 2016-04-03 ┆ 62.837857 │
│ 2016-04-04 ┆ 57.607645 │
│ 2016-04-05 ┆ 56.316084 │
│ …          ┆ …         │
│ 2017-03-27 ┆ 95.172858 │
│ 2017-03-28 ┆ 95.172858 │
│ 2017-03-29 ┆ 95.172858 │
│ 2017-03-30 ┆ 95.172858 │
│ 2017-03-31 ┆ 95.172858 │
└────────────┴───────────┘

def _with_id_and_date(frame: pl.DataFrame) -> pl.DataFrame:
    """Return ``frame`` with a string ``id`` column and a Date ``datetime`` column.

    The original ``datetime`` strings are kept verbatim as ``id`` (first
    column), while ``datetime`` itself is parsed to ``pl.Date``. Giving each
    role its own column makes later code less error-prone.
    """
    return frame.select(
        pl.col("datetime").alias("id"),
        pl.col("datetime").str.strptime(dtype=pl.Date),
        pl.exclude("datetime"),
    )


train = _with_id_and_date(train)
test = _with_id_and_date(test)

print(train.head())

shape: (5, 7)
┌────────────┬────────────┬─────┬────────┬───────┬──────────┬──────────┐
│ id         ┆ datetime   ┆ y   ┆ client ┆ close ┆ price_am ┆ price_pm │
│ ---        ┆ ---        ┆ --- ┆ ---    ┆ ---   ┆ ---      ┆ ---      │
│ str        ┆ date       ┆ i64 ┆ i64    ┆ i64   ┆ i64      ┆ i64      │
╞════════════╪════════════╪═════╪════════╪═══════╪══════════╪══════════╡
│ 2010-07-01 ┆ 2010-07-01 ┆ 17  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-02 ┆ 2010-07-02 ┆ 18  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-03 ┆ 2010-07-03 ┆ 20  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-04 ┆ 2010-07-04 ┆ 20  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
│ 2010-07-05 ┆ 2010-07-05 ┆ 14  ┆ 0      ┆ 0     ┆ -1       ┆ -1       │
└────────────┴────────────┴─────┴────────┴───────┴──────────┴──────────┘

# Scatter-plot matrix of the training data (the string id column is left out).
pairplot_input = train.drop("id").to_pandas()
g = sns.pairplot(
    pairplot_input,
    plot_kws=dict(alpha=0.5, s=10),
    diag_kws=dict(alpha=0.7, bins=10),
)
pair_fig = g.figure
pair_fig.set_size_inches(8, 8)
pair_fig.suptitle("特徴量間のペアプロット", y=1.02, fontsize=14)
plt.show()

# Skewness and kurtosis of the target (scipy defaults), shown on the histogram.
skew, kurt = stats.skew(train["y"]), stats.kurtosis(train["y"])

# Left (wide): y over time. Right (narrow): histogram of y.
fig, axes = plt.subplots(
    1,
    2,
    figsize=(12, 4),
    constrained_layout=True,
    gridspec_kw={"height_ratios": [1], "width_ratios": [2, 1]},
)
ax_trend, ax_dist = axes
fig.suptitle("引っ越し数yの推移と分布")

sns.lineplot(data=train, x="datetime", y="y", ax=ax_trend)
ax_trend.set_title("推移")

sns.histplot(data=train, x="y", ax=ax_dist)
ax_dist.set_title("分布")
# Axes-relative coordinates keep the annotation in the upper-right area
# regardless of the data range.
ax_dist.text(
    0.6,
    0.8,
    f"skewness={skew:.2f}\nkurtosis={kurt:.2f}",
    transform=ax_dist.transAxes,
    fontsize=12,
)

plt.show()

# The moments are only needed for the annotation above.
del skew, kurt

# Flag the busy season (March and April) so it can be compared with the rest
# of the year.
month_of_row = pl.col("datetime").dt.month()
train = train.with_columns(((month_of_row == 3) | (month_of_row == 4)).alias("is_busy"))

# Left (wide): y over time. Right (narrow): histogram of y. Both split by is_busy.
fig, axes = plt.subplots(
    1,
    2,
    figsize=(12, 4),
    constrained_layout=True,
    gridspec_kw={"height_ratios": [1], "width_ratios": [2, 1]},
)
fig.suptitle("引っ越し数yの推移と分布（繁忙期・非繁忙期）")

ax_trend, ax_dist = axes

ax_trend.set_title("推移")
sns.lineplot(data=train, x="datetime", y="y", hue="is_busy", ax=ax_trend)

ax_dist.set_title("分布")
sns.histplot(data=train, x="y", hue="is_busy", ax=ax_dist)

plt.show()

# TODO: Consider also generating a ydata-profiling report around here? This requires installing the library; ipywidgets is needed as well.
# import pandas as pd
# from ydata_profiling import ProfileReport
# data_train = pd.read_csv('titanic_train.csv')
# profile = ProfileReport(data_train, title="Profiling Report")
# profile.to_widgets()

# Should the series be split by calendar year or by fiscal year? Add both.
# "fy" is a fiscal year that starts in April: January-March rows are assigned
# to the previous calendar year.
calendar_year = pl.col("datetime").dt.year()
is_jan_to_mar = pl.col("datetime").dt.month() <= 3

train = train.with_columns(
    calendar_year.alias("year"),
    pl.when(is_jan_to_mar)
    .then(calendar_year - 1)
    .otherwise(calendar_year)
    .alias("fy"),
)
print(train.head(3))

shape: (3, 10)
┌────────────┬────────────┬─────┬────────┬───────┬──────────┬──────────┬─────────┬──────┬──────┐
│ id         ┆ datetime   ┆ y   ┆ client ┆ close ┆ price_am ┆ price_pm ┆ is_busy ┆ year ┆ fy   │
│ ---        ┆ ---        ┆ --- ┆ ---    ┆ ---   ┆ ---      ┆ ---      ┆ ---     ┆ ---  ┆ ---  │
│ str        ┆ date       ┆ i64 ┆ i64    ┆ i64   ┆ i64      ┆ i64      ┆ bool    ┆ i32  ┆ i32  │
╞════════════╪════════════╪═════╪════════╪═══════╪══════════╪══════════╪═════════╪══════╪══════╡
│ 2010-07-01 ┆ 2010-07-01 ┆ 17  ┆ 0      ┆ 0     ┆ -1       ┆ -1       ┆ false   ┆ 2010 ┆ 2010 │
│ 2010-07-02 ┆ 2010-07-02 ┆ 18  ┆ 0      ┆ 0     ┆ -1       ┆ -1       ┆ false   ┆ 2010 ┆ 2010 │
│ 2010-07-03 ┆ 2010-07-03 ┆ 20  ┆ 0      ┆ 0     ┆ -1       ┆ -1       ┆ false   ┆ 2010 ┆ 2010 │
└────────────┴────────────┴─────┴────────┴───────┴──────────┴──────────┴─────────┴──────┴──────┘

# Two stacked panels of the same series: coloured by calendar year (top) and
# by fiscal year (bottom).
fig, axes = plt.subplots(
    2,
    1,
    figsize=(8, 6),
    constrained_layout=True,
    gridspec_kw={"height_ratios": [1, 1], "width_ratios": [1]},
)
fig.suptitle("引っ越し数yの推移　年・年度別での比較")

panels = (("年区切り", "year"), ("年度区切り", "fy"))
for panel_ax, (panel_title, hue_column) in zip(axes, panels):
    panel_ax.set_title(panel_title)
    sns.lineplot(
        data=train,
        x="datetime",
        y="y",
        hue=hue_column,
        palette="deep",
        ax=panel_ax,
    )

plt.show()

# List every row with no recorded moves (y == 0) or a closed day (close == 1).
zero_or_closed = train.filter((pl.col("close") == 1) | (pl.col("y") == 0))

# tbl_rows=-1 prints the whole table instead of a truncated preview.
with pl.Config(tbl_rows=-1):
    print("train\n", zero_or_closed)

train
 shape: (29, 10)
┌────────────┬────────────┬─────┬────────┬───────┬──────────┬──────────┬─────────┬──────┬──────┐
│ id         ┆ datetime   ┆ y   ┆ client ┆ close ┆ price_am ┆ price_pm ┆ is_busy ┆ year ┆ fy   │
│ ---        ┆ ---        ┆ --- ┆ ---    ┆ ---   ┆ ---      ┆ ---      ┆ ---     ┆ ---  ┆ ---  │
│ str        ┆ date       ┆ i64 ┆ i64    ┆ i64   ┆ i64      ┆ i64      ┆ bool    ┆ i32  ┆ i32  │
╞════════════╪════════════╪═════╪════════╪═══════╪══════════╪══════════╪═════════╪══════╪══════╡
│ 2010-08-18 ┆ 2010-08-18 ┆ 0   ┆ 0      ┆ 0     ┆ -1       ┆ -1       ┆ false   ┆ 2010 ┆ 2010 │
│ 2010-12-31 ┆ 2010-12-31 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2010 ┆ 2010 │
│ 2011-01-01 ┆ 2011-01-01 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2011 ┆ 2010 │
│ 2011-01-02 ┆ 2011-01-02 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2011 ┆ 2010 │
│ 2011-01-03 ┆ 2011-01-03 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2011 ┆ 2010 │
│ 2011-08-14 ┆ 2011-08-14 ┆ 0   ┆ 0      ┆ 0     ┆ 0        ┆ 0        ┆ false   ┆ 2011 ┆ 2011 │
│ 2011-12-31 ┆ 2011-12-31 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2011 ┆ 2011 │
│ 2012-01-01 ┆ 2012-01-01 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2012 ┆ 2011 │
│ 2012-01-02 ┆ 2012-01-02 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2012 ┆ 2011 │
│ 2012-01-03 ┆ 2012-01-03 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2012 ┆ 2011 │
│ 2012-12-31 ┆ 2012-12-31 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2012 ┆ 2012 │
│ 2013-01-01 ┆ 2013-01-01 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2013 ┆ 2012 │
│ 2013-01-02 ┆ 2013-01-02 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2013 ┆ 2012 │
│ 2013-01-03 ┆ 2013-01-03 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2013 ┆ 2012 │
│ 2013-08-12 ┆ 2013-08-12 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2013 ┆ 2013 │
│ 2013-12-31 ┆ 2013-12-31 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2013 ┆ 2013 │
│ 2014-01-01 ┆ 2014-01-01 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2014 ┆ 2013 │
│ 2014-01-02 ┆ 2014-01-02 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2014 ┆ 2013 │
│ 2014-01-03 ┆ 2014-01-03 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2014 ┆ 2013 │
│ 2014-08-12 ┆ 2014-08-12 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-12-31 ┆ 2014-12-31 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2014 ┆ 2014 │
│ 2015-01-01 ┆ 2015-01-01 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2015 ┆ 2014 │
│ 2015-01-02 ┆ 2015-01-02 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2015 ┆ 2014 │
│ 2015-01-03 ┆ 2015-01-03 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2015 ┆ 2014 │
│ 2015-08-12 ┆ 2015-08-12 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2015 ┆ 2015 │
│ 2015-12-31 ┆ 2015-12-31 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2015 ┆ 2015 │
│ 2016-01-01 ┆ 2016-01-01 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2016 ┆ 2015 │
│ 2016-01-02 ┆ 2016-01-02 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2016 ┆ 2015 │
│ 2016-01-03 ┆ 2016-01-03 ┆ 0   ┆ 0      ┆ 1     ┆ -1       ┆ -1       ┆ false   ┆ 2016 ┆ 2015 │
└────────────┴────────────┴─────┴────────┴───────┴──────────┴──────────┴─────────┴──────┴──────┘

# Closed days (close == 1) have zero moves in the table printed above, so they
# are predicted by rule (y = 0) and kept out of model training.
#
# 2010-08-18 and 2011-08-14 also have y == 0 but close == 0. Both fall in the
# Obon period, which appears to be flagged as closed in later years, so they
# are removed from the training data as well.
train = train.filter(
    (pl.col("close") != 1) & ~pl.col("id").is_in(["2010-08-18", "2011-08-14"])
)

# test_close: ids of the closed days in the test set with a rule-based y = 0.
# pl.lit(0.0) gives a Float64 column even when there are no closed days; a
# Series built from an empty Python list has no values to infer Float64 from.
test_close = (
    test.filter(pl.col("close") == 1)
    .select("id")
    .with_columns(pl.lit(0.0).alias("y"))
)
# The remaining (open) days are the ones the model has to predict.
test = test.filter(pl.col("close") != 1)

# Sanity check: both frames below should be empty.
with pl.Config(tbl_rows=-1):
    print("train\n", train.filter((pl.col("y") == 0) | (pl.col("close") == 1)))
    print("test\n", test.filter((pl.col("close") == 1)))

train
 shape: (0, 10)
┌─────┬──────────┬─────┬────────┬───────┬──────────┬──────────┬─────────┬──────┬─────┐
│ id  ┆ datetime ┆ y   ┆ client ┆ close ┆ price_am ┆ price_pm ┆ is_busy ┆ year ┆ fy  │
│ --- ┆ ---      ┆ --- ┆ ---    ┆ ---   ┆ ---      ┆ ---      ┆ ---     ┆ ---  ┆ --- │
│ str ┆ date     ┆ i64 ┆ i64    ┆ i64   ┆ i64      ┆ i64      ┆ bool    ┆ i32  ┆ i32 │
╞═════╪══════════╪═════╪════════╪═══════╪══════════╪══════════╪═════════╪══════╪═════╡
└─────┴──────────┴─────┴────────┴───────┴──────────┴──────────┴─────────┴──────┴─────┘
test
 shape: (0, 6)
┌─────┬──────────┬────────┬───────┬──────────┬──────────┐
│ id  ┆ datetime ┆ client ┆ close ┆ price_am ┆ price_pm │
│ --- ┆ ---      ┆ ---    ┆ ---   ┆ ---      ┆ ---      │
│ str ┆ date     ┆ i64    ┆ i64   ┆ i64      ┆ i64      │
╞═════╪══════════╪════════╪═══════╪══════════╪══════════╡
└─────┴──────────┴────────┴───────┴──────────┴──────────┘

# Closed days have been split off, so the close column is no longer needed.
train, test = train.drop("close"), test.drop("close")

# Effect of the client flag. Left (wide): y over time. Right (narrow):
# histogram of y. Both are coloured by client.
fig, axes = plt.subplots(
    1,
    2,
    figsize=(12, 4),
    constrained_layout=True,
    gridspec_kw={"height_ratios": [1], "width_ratios": [2, 1]},
)
fig.suptitle("clientフラグについて\n引越し数 y の推移と分布")

ax_trend, ax_dist = axes

ax_trend.set_title("推移")
ax_trend.tick_params(axis="x", rotation=90)
sns.lineplot(data=train, x="datetime", y="y", hue="client", ax=ax_trend)

ax_dist.set_title("分布")
sns.histplot(data=train, x="y", hue="client", ax=ax_dist)

plt.show()

# Rows where the client flag is set.
print(train.filter(pl.col("client") == 1))

shape: (206, 9)
┌────────────┬────────────┬─────┬────────┬──────────┬──────────┬─────────┬──────┬──────┐
│ id         ┆ datetime   ┆ y   ┆ client ┆ price_am ┆ price_pm ┆ is_busy ┆ year ┆ fy   │
│ ---        ┆ ---        ┆ --- ┆ ---    ┆ ---      ┆ ---      ┆ ---     ┆ ---  ┆ ---  │
│ str        ┆ date       ┆ i64 ┆ i64    ┆ i64      ┆ i64      ┆ bool    ┆ i32  ┆ i32  │
╞════════════╪════════════╪═════╪════════╪══════════╪══════════╪═════════╪══════╪══════╡
│ 2014-05-20 ┆ 2014-05-20 ┆ 19  ┆ 1      ┆ 0        ┆ 0        ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-06-14 ┆ 2014-06-14 ┆ 45  ┆ 1      ┆ 1        ┆ 1        ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-07-10 ┆ 2014-07-10 ┆ 30  ┆ 1      ┆ 0        ┆ 0        ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-07-19 ┆ 2014-07-19 ┆ 41  ┆ 1      ┆ 2        ┆ 1        ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-07-23 ┆ 2014-07-23 ┆ 40  ┆ 1      ┆ 0        ┆ 0        ┆ false   ┆ 2014 ┆ 2014 │
│ …          ┆ …          ┆ …   ┆ …      ┆ …        ┆ …        ┆ …       ┆ …    ┆ …    │
│ 2016-03-27 ┆ 2016-03-27 ┆ 94  ┆ 1      ┆ 5        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-28 ┆ 2016-03-28 ┆ 86  ┆ 1      ┆ 4        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-29 ┆ 2016-03-29 ┆ 98  ┆ 1      ┆ 4        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-30 ┆ 2016-03-30 ┆ 99  ┆ 1      ┆ 5        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-31 ┆ 2016-03-31 ┆ 105 ┆ 1      ┆ 5        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
└────────────┴────────────┴─────┴────────┴──────────┴──────────┴─────────┴──────┴──────┘

# Repeat the client comparison on 2014-05-01 onwards only. In the table
# printed above, the first client == 1 row shown is dated 2014-05-20.
recent = train.filter(pl.col("datetime") >= date(2014, 5, 1))

fig, axes = plt.subplots(
    1,
    2,
    figsize=(12, 4),
    constrained_layout=True,
    gridspec_kw={"height_ratios": [1], "width_ratios": [2, 1]},
)
fig.suptitle("clientフラグについて\n引越し数 y の推移と分布(2014-05-01以降)")

ax_trend, ax_dist = axes

ax_trend.set_title("推移")
ax_trend.tick_params(axis="x", rotation=90)
sns.lineplot(data=recent, x="datetime", y="y", hue="client", ax=ax_trend)

ax_dist.set_title("分布")
sns.histplot(data=recent, x="y", hue="client", ax=ax_dist)

plt.show()

# Mean of y within the restricted period, for each value of the client flag.
for mean_label, client_flag in (
    ("client=1(法人あり)のときの引っ越し数yの平均", 1),
    ("client=0(通常時)のときの引っ越し数yの平均", 0),
):
    mean_y = recent.filter(pl.col("client") == client_flag)["y"].mean()
    print(mean_label, np.round(mean_y, decimals=2))

client=1(法人あり)のときの引っ越し数yの平均 50.58
client=0(通常時)のときの引っ越し数yの平均 41.26

# Colour sequence for the price categories (reversed Magma).
# Reference: https://oeconomicus.jp/2021/07/plotly-color-scale/
graph_color = px.colors.sequential.Magma_r

# plotly express is fed a pandas copy of the training data.
df = train.to_pandas().copy()

# Fix one colour per price category, using the union of the morning and
# afternoon categories so both rows of the figure share the same mapping.
am_categories = set(df["price_am"].unique())
pm_categories = set(df["price_pm"].unique())
cats_all = sorted(am_categories | pm_categories)
colors = graph_color[: len(cats_all)]
# Keys are stringified to be safe; the plotted colour columns are strings too.
color_map = {str(category): colors[idx] for idx, category in enumerate(cats_all)}

# String versions of the price columns, so legend / trace names match the
# keys of color_map.
df = df.assign(
    price_am_str=df["price_am"].astype(str),
    price_pm_str=df["price_pm"].astype(str),
)

# 2 x 2 grid: one row per time of day, trend on the left (wide) and
# distribution on the right (narrow).
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=("午前: 推移", "午前: 分布", "午後: 推移", "午後: 分布"),
    column_widths=[0.7, 0.3],
    row_heights=[0.5, 0.5],
    horizontal_spacing=0.05,
    vertical_spacing=0.15,
)

fig.update_layout(
    title_text="午前/午後 料金区分別<br>引越し数 y の推移と分布",
    width=1500,
    height=900,
    showlegend=True,
    legend_tracegroupgap=12,
)


# 指定された行に対して、左に推移(scatter)、右に分布(histogram)を追加する関数
def add_section(colname_str: str, row: int) -> None:
    """Fill one row of the module-level ``fig`` for a price column.

    A scatter of ``y`` over ``datetime`` goes in column 1 and a histogram of
    ``y`` in column 2, both coloured by ``colname_str``. The function reads the
    module-level ``df``, ``cats_all`` and ``color_map`` and mutates ``fig``.

    Args:
        colname_str: Name of the string-typed category column in ``df``.
        row: Subplot row to draw into. Legend entries are shown only for row 1.
    """
    # Colour settings shared by both plots so categories get identical colours
    # and a fixed ordering.
    colour_kwargs = dict(
        color=colname_str,
        category_orders={colname_str: list(map(str, cats_all))},
        color_discrete_map=color_map,
    )

    # Left column: y over time.
    for trace in px.scatter(df, x="datetime", y="y", **colour_kwargs).data:
        # One legend group per category, so toggling a legend entry affects
        # every trace of that category.
        trace.legendgroup = trace.name
        # Show legend entries for the top row only, to avoid duplicates.
        trace.showlegend = row == 1
        fig.add_trace(trace, row=row, col=1)

    # Right column: distribution of y.
    for trace in px.histogram(df, x="y", **colour_kwargs).data:
        trace.legendgroup = trace.name
        # No legend entry of its own, but still linked through the legend group.
        trace.showlegend = False
        fig.add_trace(trace, row=row, col=2)

    # Axis labels as (subplot column, x title, y title).
    for col, x_title, y_title in ((1, "datetime", "y"), (2, "y", "count")):
        fig.update_xaxes(title_text=x_title, row=row, col=col)
        fig.update_yaxes(title_text=y_title, row=row, col=col)


# Row 1: morning price category. Row 2: afternoon price category.
for _row_no, _price_col in enumerate(("price_am_str", "price_pm_str"), start=1):
    add_section(_price_col, row=_row_no)

fig.show()

# Drop the plotting helpers so the names cannot be reused by accident later.
del df, cats_all, colors, color_map

# Count the rows dated up to the end of 2010, and how many of them have the
# price category set to -1 (labelled as missing in the printed messages).
rows_2010 = train.filter(pl.col("datetime") <= date(2010, 12, 31))

print("2010年のデータ数", rows_2010.height)
for _price_col, _message in (
    ("price_am", "2010年でprice_am=-1(欠損)となっているデータ数"),
    ("price_pm", "2010年でprice_pm=-1(欠損)となっているデータ数"),
):
    print(_message, rows_2010.filter(pl.col(_price_col) == -1).height)

2010年のデータ数 182
2010年でprice_am=-1(欠損)となっているデータ数 182
2010年でprice_pm=-1(欠損)となっているデータ数 182

# Look at the rows where the morning or afternoon price category is 3 or
# higher. There are few enough of them to print in full.
high_price = (pl.col("price_am") >= 3) | (pl.col("price_pm") >= 3)
high_price_rows = train.filter(high_price)

with pl.Config(tbl_rows=-1):
    print(high_price_rows)

shape: (114, 9)
┌────────────┬────────────┬─────┬────────┬──────────┬──────────┬─────────┬──────┬──────┐
│ id         ┆ datetime   ┆ y   ┆ client ┆ price_am ┆ price_pm ┆ is_busy ┆ year ┆ fy   │
│ ---        ┆ ---        ┆ --- ┆ ---    ┆ ---      ┆ ---      ┆ ---     ┆ ---  ┆ ---  │
│ str        ┆ date       ┆ i64 ┆ i64    ┆ i64      ┆ i64      ┆ bool    ┆ i32  ┆ i32  │
╞════════════╪════════════╪═════╪════════╪══════════╪══════════╪═════════╪══════╪══════╡
│ 2011-02-26 ┆ 2011-02-26 ┆ 32  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2011 ┆ 2010 │
│ 2011-03-19 ┆ 2011-03-19 ┆ 40  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-20 ┆ 2011-03-20 ┆ 41  ┆ 0      ┆ 5        ┆ 3        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-21 ┆ 2011-03-21 ┆ 39  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-22 ┆ 2011-03-22 ┆ 39  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-23 ┆ 2011-03-23 ┆ 37  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-24 ┆ 2011-03-24 ┆ 36  ┆ 0      ┆ 5        ┆ 3        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-25 ┆ 2011-03-25 ┆ 35  ┆ 0      ┆ 4        ┆ 5        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-26 ┆ 2011-03-26 ┆ 40  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-27 ┆ 2011-03-27 ┆ 35  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-28 ┆ 2011-03-28 ┆ 49  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-29 ┆ 2011-03-29 ┆ 37  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-30 ┆ 2011-03-30 ┆ 38  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-03-31 ┆ 2011-03-31 ┆ 34  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2011 ┆ 2010 │
│ 2011-04-01 ┆ 2011-04-01 ┆ 48  ┆ 0      ┆ 4        ┆ 2        ┆ true    ┆ 2011 ┆ 2011 │
│ 2011-04-02 ┆ 2011-04-02 ┆ 29  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2011 ┆ 2011 │
│ 2011-04-03 ┆ 2011-04-03 ┆ 28  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2011 ┆ 2011 │
│ 2011-04-09 ┆ 2011-04-09 ┆ 39  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2011 ┆ 2011 │
│ 2011-04-23 ┆ 2011-04-23 ┆ 26  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2011 ┆ 2011 │
│ 2011-04-30 ┆ 2011-04-30 ┆ 21  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2011 ┆ 2011 │
│ 2011-12-30 ┆ 2011-12-30 ┆ 15  ┆ 0      ┆ 1        ┆ 3        ┆ false   ┆ 2011 ┆ 2011 │
│ 2012-02-25 ┆ 2012-02-25 ┆ 49  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2012 ┆ 2011 │
│ 2012-03-17 ┆ 2012-03-17 ┆ 58  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-18 ┆ 2012-03-18 ┆ 45  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-20 ┆ 2012-03-20 ┆ 51  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-22 ┆ 2012-03-22 ┆ 49  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-23 ┆ 2012-03-23 ┆ 47  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-24 ┆ 2012-03-24 ┆ 55  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-25 ┆ 2012-03-25 ┆ 49  ┆ 0      ┆ 4        ┆ 5        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-26 ┆ 2012-03-26 ┆ 69  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-27 ┆ 2012-03-27 ┆ 88  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-29 ┆ 2012-03-29 ┆ 63  ┆ 0      ┆ 4        ┆ 2        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-30 ┆ 2012-03-30 ┆ 69  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-03-31 ┆ 2012-03-31 ┆ 63  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2012 ┆ 2011 │
│ 2012-04-01 ┆ 2012-04-01 ┆ 36  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2012 ┆ 2012 │
│ 2012-04-02 ┆ 2012-04-02 ┆ 45  ┆ 0      ┆ 2        ┆ 3        ┆ true    ┆ 2012 ┆ 2012 │
│ 2012-04-07 ┆ 2012-04-07 ┆ 39  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2012 ┆ 2012 │
│ 2012-04-29 ┆ 2012-04-29 ┆ 54  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2012 ┆ 2012 │
│ 2012-07-28 ┆ 2012-07-28 ┆ 48  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2012 ┆ 2012 │
│ 2012-09-29 ┆ 2012-09-29 ┆ 53  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2012 ┆ 2012 │
│ 2012-12-23 ┆ 2012-12-23 ┆ 46  ┆ 0      ┆ 2        ┆ 3        ┆ false   ┆ 2012 ┆ 2012 │
│ 2013-02-23 ┆ 2013-02-23 ┆ 56  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2013 ┆ 2012 │
│ 2013-03-16 ┆ 2013-03-16 ┆ 66  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-20 ┆ 2013-03-20 ┆ 74  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-21 ┆ 2013-03-21 ┆ 71  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-22 ┆ 2013-03-22 ┆ 65  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-23 ┆ 2013-03-23 ┆ 60  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-24 ┆ 2013-03-24 ┆ 66  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-25 ┆ 2013-03-25 ┆ 76  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-26 ┆ 2013-03-26 ┆ 81  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-27 ┆ 2013-03-27 ┆ 74  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-28 ┆ 2013-03-28 ┆ 83  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-29 ┆ 2013-03-29 ┆ 71  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-30 ┆ 2013-03-30 ┆ 84  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-03-31 ┆ 2013-03-31 ┆ 69  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2013 ┆ 2012 │
│ 2013-04-01 ┆ 2013-04-01 ┆ 67  ┆ 0      ┆ 4        ┆ 2        ┆ true    ┆ 2013 ┆ 2013 │
│ 2013-04-06 ┆ 2013-04-06 ┆ 60  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2013 ┆ 2013 │
│ 2014-02-22 ┆ 2014-02-22 ┆ 50  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2014 ┆ 2013 │
│ 2014-03-15 ┆ 2014-03-15 ┆ 70  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-16 ┆ 2014-03-16 ┆ 84  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-21 ┆ 2014-03-21 ┆ 78  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-22 ┆ 2014-03-22 ┆ 69  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-23 ┆ 2014-03-23 ┆ 77  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-24 ┆ 2014-03-24 ┆ 81  ┆ 0      ┆ 3        ┆ 3        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-25 ┆ 2014-03-25 ┆ 72  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-26 ┆ 2014-03-26 ┆ 88  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-27 ┆ 2014-03-27 ┆ 69  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-28 ┆ 2014-03-28 ┆ 62  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-29 ┆ 2014-03-29 ┆ 58  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-30 ┆ 2014-03-30 ┆ 46  ┆ 0      ┆ 5        ┆ 5        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-03-31 ┆ 2014-03-31 ┆ 53  ┆ 0      ┆ 4        ┆ 5        ┆ true    ┆ 2014 ┆ 2013 │
│ 2014-04-01 ┆ 2014-04-01 ┆ 44  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2014 ┆ 2014 │
│ 2014-04-02 ┆ 2014-04-02 ┆ 41  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2014 ┆ 2014 │
│ 2014-04-05 ┆ 2014-04-05 ┆ 56  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2014 ┆ 2014 │
│ 2014-04-06 ┆ 2014-04-06 ┆ 45  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2014 ┆ 2014 │
│ 2014-04-12 ┆ 2014-04-12 ┆ 60  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2014 ┆ 2014 │
│ 2014-04-26 ┆ 2014-04-26 ┆ 51  ┆ 0      ┆ 4        ┆ 3        ┆ true    ┆ 2014 ┆ 2014 │
│ 2014-07-26 ┆ 2014-07-26 ┆ 44  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-09-27 ┆ 2014-09-27 ┆ 54  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2014 ┆ 2014 │
│ 2014-12-27 ┆ 2014-12-27 ┆ 37  ┆ 1      ┆ 3        ┆ 2        ┆ false   ┆ 2014 ┆ 2014 │
│ 2015-02-28 ┆ 2015-02-28 ┆ 86  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2015 ┆ 2014 │
│ 2015-03-21 ┆ 2015-03-21 ┆ 83  ┆ 1      ┆ 3        ┆ 3        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-22 ┆ 2015-03-22 ┆ 84  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-23 ┆ 2015-03-23 ┆ 88  ┆ 0      ┆ 4        ┆ 2        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-24 ┆ 2015-03-24 ┆ 109 ┆ 0      ┆ 5        ┆ 2        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-25 ┆ 2015-03-25 ┆ 78  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-26 ┆ 2015-03-26 ┆ 85  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-27 ┆ 2015-03-27 ┆ 95  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-28 ┆ 2015-03-28 ┆ 88  ┆ 1      ┆ 5        ┆ 5        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-29 ┆ 2015-03-29 ┆ 96  ┆ 0      ┆ 5        ┆ 4        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-30 ┆ 2015-03-30 ┆ 87  ┆ 1      ┆ 4        ┆ 3        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-03-31 ┆ 2015-03-31 ┆ 75  ┆ 1      ┆ 4        ┆ 3        ┆ true    ┆ 2015 ┆ 2014 │
│ 2015-04-01 ┆ 2015-04-01 ┆ 61  ┆ 1      ┆ 3        ┆ 3        ┆ true    ┆ 2015 ┆ 2015 │
│ 2015-04-04 ┆ 2015-04-04 ┆ 86  ┆ 1      ┆ 4        ┆ 3        ┆ true    ┆ 2015 ┆ 2015 │
│ 2015-04-11 ┆ 2015-04-11 ┆ 69  ┆ 1      ┆ 3        ┆ 3        ┆ true    ┆ 2015 ┆ 2015 │
│ 2015-04-25 ┆ 2015-04-25 ┆ 56  ┆ 1      ┆ 4        ┆ 3        ┆ true    ┆ 2015 ┆ 2015 │
│ 2015-04-26 ┆ 2015-04-26 ┆ 54  ┆ 0      ┆ 3        ┆ 2        ┆ true    ┆ 2015 ┆ 2015 │
│ 2015-07-25 ┆ 2015-07-25 ┆ 51  ┆ 0      ┆ 3        ┆ 2        ┆ false   ┆ 2015 ┆ 2015 │
│ 2016-02-27 ┆ 2016-02-27 ┆ 83  ┆ 1      ┆ 3        ┆ 2        ┆ false   ┆ 2016 ┆ 2015 │
│ 2016-03-17 ┆ 2016-03-17 ┆ 87  ┆ 0      ┆ 2        ┆ 3        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-18 ┆ 2016-03-18 ┆ 102 ┆ 0      ┆ 2        ┆ 3        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-19 ┆ 2016-03-19 ┆ 94  ┆ 1      ┆ 3        ┆ 3        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-20 ┆ 2016-03-20 ┆ 101 ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-21 ┆ 2016-03-21 ┆ 100 ┆ 1      ┆ 3        ┆ 3        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-22 ┆ 2016-03-22 ┆ 96  ┆ 0      ┆ 4        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-23 ┆ 2016-03-23 ┆ 96  ┆ 1      ┆ 2        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-24 ┆ 2016-03-24 ┆ 94  ┆ 1      ┆ 3        ┆ 2        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-25 ┆ 2016-03-25 ┆ 90  ┆ 1      ┆ 4        ┆ 5        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-26 ┆ 2016-03-26 ┆ 93  ┆ 1      ┆ 5        ┆ 5        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-27 ┆ 2016-03-27 ┆ 94  ┆ 1      ┆ 5        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-28 ┆ 2016-03-28 ┆ 86  ┆ 1      ┆ 4        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-29 ┆ 2016-03-29 ┆ 98  ┆ 1      ┆ 4        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-30 ┆ 2016-03-30 ┆ 99  ┆ 1      ┆ 5        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
│ 2016-03-31 ┆ 2016-03-31 ┆ 105 ┆ 1      ┆ 5        ┆ 4        ┆ true    ┆ 2016 ┆ 2015 │
└────────────┴────────────┴─────┴────────┴──────────┴──────────┴─────────┴──────┴──────┘

# y rises from year to year, so its distribution keeps shifting to the right.
# To deal with that, fit the long-term trend of y with Prophet so that it can
# be removed from y later.

# Prophet takes pandas frames with a "ds" date column (and "y" for training).
## Training data
train_pandas = train.select(pl.col("datetime").alias("ds"), "y").to_pandas().copy()
## Test data: only the dates are passed
test_pandas = test.select(pl.col("datetime").alias("ds")).to_pandas().copy()

# Only the overall multi-year trend is wanted here, so every seasonality
# component is switched off.
model = Prophet(
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
    seasonality_mode="additive",
)
model.fit(train_pandas)

# Predict once per dataset; the results are reused for plotting and for the
# trend column.
forecast_train = model.predict(train_pandas)
forecast_test = model.predict(test_pandas)

# Trend column for the training frame, taken from the forecast computed above
# instead of running predict a second time.
forecast_train_pl = pl.Series("y_trend", forecast_train["yhat"].values)

# Appended by position, not joined on a key.
# NOTE(review): assumes the forecast rows are in the same order as train - confirm.
train = train.with_columns(forecast_train_pl)

15:44:11 - cmdstanpy - INFO - Chain [1] start processing
15:44:12 - cmdstanpy - INFO - Chain [1] done processing

# Visualise the Prophet fit on the training period and its extrapolation over
# the test period.
fig, ax = plt.subplots(
    figsize=(12, 5),
    constrained_layout=True,
)
ax.set_title("Prophetによる予測（Train=青, Test=赤）")

# Training side: Prophet's standard plot (blue, with the observed points).
# Reuse forecast_train rather than calling predict again, so the figure shows
# the same forecast that produced y_trend.
model.plot(forecast_train, ax=ax)

# Test side: drawn on top afterwards, in red.
ax.plot(
    forecast_test["ds"],
    forecast_test["yhat"],
    color="red",
    linewidth=2,
    label="Test yhat",
)
ax.fill_between(
    forecast_test["ds"],
    forecast_test["yhat_lower"],
    forecast_test["yhat_upper"],
    color="red",
    alpha=0.2,
    label="Test interval",
)

ax.legend()
plt.show()

# Delete names that are likely to be re-declared later.
del train_pandas, test_pandas, forecast_train, forecast_test, forecast_train_pl, model

# Add the residual from the fitted trend, plus weekday and month columns
# derived from the date.
row_date = pl.col("datetime")
train = train.with_columns(
    y_resid=pl.col("y") - pl.col("y_trend"),
    weekday=row_date.dt.weekday(),
    month=row_date.dt.month(),
)

# Check whether subtracting the trend removes the year-to-year shift of the
# distribution. Top row: raw y. Bottom row: residual from the trend.
fig, axes = plt.subplots(
    2,
    2,
    figsize=(12, 8),
    constrained_layout=True,
    gridspec_kw={"height_ratios": [1, 1], "width_ratios": [2, 1]},
)
fig.suptitle("引越し数の推移と分布。元データ(上段)とトレンドとの残差(下段)の比較")

(ax_raw_trend, ax_raw_dist), (ax_res_trend, ax_res_dist) = axes
by_year = {"hue": "year", "palette": "deep"}

# Top row: raw y, with the fitted trend overlaid in dark grey.
ax_raw_trend.set_title("推移")
ax_raw_trend.tick_params(axis="x", rotation=90)
sns.lineplot(data=train, x="datetime", y="y", ax=ax_raw_trend, **by_year)
sns.lineplot(data=train, x="datetime", y="y_trend", color="0.1", ax=ax_raw_trend)

ax_raw_dist.set_title("分布")
sns.histplot(data=train, x="y", ax=ax_raw_dist, **by_year)

# Bottom row: residuals.
ax_res_trend.set_title("推移(残差)")
ax_res_trend.tick_params(axis="x", rotation=90)
sns.scatterplot(
    data=train,
    x="datetime",
    y="y_resid",
    marker="+",
    ax=ax_res_trend,
    **by_year,
)

ax_res_dist.set_title("分布(残差)")
sns.histplot(data=train, x="y_resid", ax=ax_res_dist, **by_year)

plt.show()

# Also check how the overall (all years pooled) distribution changed.
# Skewness and kurtosis are computed with scipy's default arguments.
resid = train["y_resid"]
skew_resid = stats.skew(resid)
kurt_resid = stats.kurtosis(resid)

fig, axes = plt.subplots(
    1,
    2,
    figsize=(12, 4),
    height_ratios=[1],
    width_ratios=[2, 1],
    constrained_layout=True,
)
fig.suptitle("引っ越し数yのトレンドとの残差の推移と分布")
ax_ts, ax_hist = axes

# Left panel: residuals over time
ax_ts.set_title("推移(残差)")
ax_ts.tick_params(axis="x", rotation=90)
sns.scatterplot(data=train, x="datetime", y="y_resid", marker="+", ax=ax_ts)

# Right panel: residual histogram, annotated with the two moments
ax_hist.set_title("分布(残差)")
sns.histplot(data=train, x="y_resid", ax=ax_hist)
moment_label = f"skewness={skew_resid:.2f}\nkurtosis={kurt_resid:.2f}"
ax_hist.text(
    0.6,
    0.8,
    moment_label,
    fontsize=12,
    transform=ax_hist.transAxes,  # position given in axes-fraction coordinates
)

plt.show()

# Look at the residual distribution per weekday and per month
fig, axes = plt.subplots(
    1,
    2,
    figsize=(8, 4),
    height_ratios=[1],
    width_ratios=[1, 1],
    constrained_layout=True,
)

# One box plot per grouping column: (column, panel title)
panels = [
    ("weekday", "曜日別の残差分布"),
    ("month", "月別の残差分布"),
]
for panel_ax, (group_col, panel_title) in zip(axes, panels):
    panel_ax.set_title(panel_title)
    sns.boxplot(x=group_col, y="y_resid", data=train, ax=panel_ax)

plt.show()

def _box_cox_y(lam):
    """Return the power transform (y**lam - 1) / lam of column "y" as a polars expression."""
    return (pl.col("y").pow(lam) - 1) / lam


# Numeric transforms to try: output column name -> polars expression.
# NOTE(review): the log and fractional-power transforms assume y > 0;
# presumably zero-count rows were handled earlier in the notebook — confirm.
transforms = {
    "y_ln": pl.col("y").log(),
    "y_bc_1_2": _box_cox_y(1 / 2),
    "y_bc_1_10": _box_cox_y(1 / 10),
}

moments = {}  # per-transform skewness / kurtosis of the residuals
results = {}  # per-transform Prophet forecast on the training data

for name, expr in transforms.items():
    trend_col = f"{name}_trend"
    resid_col = f"{name}_resid"

    # Add the transformed column to train
    train = train.with_columns(expr.alias(name))

    # pandas frame for Prophet: date as "ds", transformed target as "y"
    df = (
        train.select(["datetime", name])
        .rename({"datetime": "ds", name: "y"})
        .to_pandas()
        .copy()
    )

    # Trend-only Prophet model (all seasonality components disabled)
    model = Prophet(
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False,
        seasonality_mode="additive",
    )
    model.fit(df)

    # Keep the full forecast, and attach its yhat to train as the trend column,
    # followed by the residual (transformed value minus trend)
    forecast = model.predict(df)
    results[name] = forecast
    train = train.with_columns(
        pl.Series(trend_col, forecast["yhat"].values)
    ).with_columns((pl.col(name) - pl.col(trend_col)).alias(resid_col))

    # Moments of the residuals.
    # bias=True requests the uncorrected (biased) estimators;
    # fisher=True reports excess kurtosis (a normal distribution gives 0).
    resid_values = train[resid_col].to_numpy()
    moments[name] = {
        "skew": float(stats.skew(resid_values, bias=True)),
        "kurt": float(stats.kurtosis(resid_values, fisher=True, bias=True)),
    }

15:44:16 - cmdstanpy - INFO - Chain [1] start processing
15:44:16 - cmdstanpy - INFO - Chain [1] done processing
15:44:17 - cmdstanpy - INFO - Chain [1] start processing
15:44:17 - cmdstanpy - INFO - Chain [1] done processing
15:44:17 - cmdstanpy - INFO - Chain [1] start processing
15:44:17 - cmdstanpy - INFO - Chain [1] done processing

# Plot: one row per transform, residuals over time (left) and their
# distribution (right).

n_transforms = len(transforms)

fig, axes = plt.subplots(
    nrows=n_transforms,
    ncols=2,
    # Derived from the number of transforms so that adding or removing an
    # entry in `transforms` does not break the layout.
    height_ratios=[1] * n_transforms,
    width_ratios=[2, 1],
    figsize=(15, 4 * n_transforms),
    constrained_layout=True,
    # Always get a 2-D axes array, so axes[i, j] also works for a single row.
    squeeze=False,
)
fig.suptitle("各種数値変換を行った y の、トレンドとの残差の推移と分布", fontsize=16)

# Convert once; the frame does not change inside the loop
train_pd = train.to_pandas()

# Visualize each transform in turn
for i, name in enumerate(transforms):
    resid_col = f"{name}_resid"

    # --- residuals over time ---
    axes[i, 0].set_title(f"推移 ({name})")
    sns.lineplot(data=train_pd, x="datetime", y=resid_col, ax=axes[i, 0], linewidth=1)
    axes[i, 0].axhline(0, color="red", linestyle="--", linewidth=1)

    # --- residual distribution ---
    axes[i, 1].set_title(f"分布 ({name})")
    sns.histplot(
        data=train_pd,
        x=resid_col,
        kde=True,
        stat="density",
        ax=axes[i, 1],
    )
    axes[i, 1].axvline(0, color="red", linestyle="--", linewidth=1)

    # Annotate with the skewness / kurtosis computed for this transform
    m = moments[name]
    axes[i, 1].text(
        0.05,
        0.95,
        f"skew={m['skew']:.2f}\nkurt={m['kurt']:.2f}",
        transform=axes[i, 1].transAxes,
        ha="left",
        va="top",
        fontsize=11,
        color="black",
        bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
    )

plt.show()

# y_residを数値変換→残差分析といった順番でやった版。尖度・歪度がおかしくなったので没
# （y_residは負の値を含みうるため、logや1/2乗・1/10乗の結果がNaNになることが原因と思われる。要確認）

# # 数値変換の定義（カラム名, 変換式）
# transforms = {
#     "y_resid_ln": pl.col("y_resid").log(),
#     "y_resid_bc_1_2": (pl.col("y_resid").pow(1/2) - 1) / (1/2),
#     "y_resid_bc_1_10": (pl.col("y_resid").pow(1/10) - 1) / (1/10),
# }

# moments = {}   # 変換ごとの skew / kurtosis を格納
# results = {}   # 変換ごとの prophet による予測結果を保存

# for name, expr in transforms.items():
#     # 変換列をtrainに追加
#     train = train.with_columns(expr.alias(name))

#     # 統計量を計算
#     x = train[name].to_numpy()
#     # bias=False で標本推定、fisher=True で excess（正規=0）
#     moments[name] = {
#         "skew": float(stats.skew(x, bias=False)),
#         "kurt": float(stats.kurtosis(x, fisher=True, bias=False)),
#     }

# # 描画設定
# fig, axes = plt.subplots(
#     nrows=len(transforms), ncols=2,
#     figsize=(15, 4 * len(transforms)),
#     constrained_layout=True,
# )
# fig.suptitle("各種数値変換を行った y の、トレンドとの残差の推移と分布", fontsize=16)

# # ループで各変換を可視化
# for i, name in enumerate(transforms.keys()):
#     # --- 残差の推移 ---
#     axes[i, 0].set_title(f"推移 ({name})")
#     sns.lineplot(
#         data=train.to_pandas(),
#         x="datetime", y=name,
#         ax=axes[i, 0], linewidth=1
#     )
#     axes[i, 0].axhline(0, color="red", linestyle="--", linewidth=1)

#     # --- 残差の分布 ---
#     axes[i, 1].set_title(f"分布 ({name})")
#     sns.histplot(
#         data=train.to_pandas(),
#         x=name,
#         kde=True, stat="density",
#         ax=axes[i, 1],
#     )
#     axes[i, 1].axvline(0, color="red", linestyle="--", linewidth=1)

#     # skew / kurt を表示
#     m = moments[name]
#     axes[i, 1].text(
#         0.05, 0.95,
#         f"skew={m['skew']:.2f}\nkurt={m['kurt']:.2f}",
#         transform=axes[i, 1].transAxes,
#         ha="left", va="top",
#         fontsize=11, color="black",
#         bbox=dict(boxstyle="round", facecolor="white", alpha=0.7),
#     )

# plt.show()

# import ipynbname
# from pathlib import Path

# output_dir = Path("html")

# !jupyter nbconvert --to html "{str(ipynbname.path())}" --output-dir "{output_dir}"

ノートブックの概要¶

ライブラリのインポート¶

データ読み込んでみる¶

時系列データを可視化¶

時間軸をどこで区切るべきか(年？年度？)¶

引っ越し件数0の部分(→休業日と判明)¶

clientフラグについて¶

料金区分(午前、午後)¶

料金区分price_am, price_pmが3以上のものを詳しく見てみる¶

トレンドを捉える(prophet)¶

トレンドとの残差を確認¶

数値変換を試す¶

没：y_residを数値変換→分析といった順番でやった版¶

EDAのまとめ¶

参考サイト¶

htmlに変換したものを出力¶