import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

/tmp/ipykernel_78301/2972804336.py:1: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd

# Load the obesity dataset from a CSV file given by a relative path.
df = pd.read_csv('obesity_data.csv')

# Summary statistics for every column; include='all' adds the
# count / unique / top / freq rows for the non-numeric columns.
df.describe(include='all')

# Preview the first ten rows.
df.head(10)

# Distinct class labels present in the 'ObesityCategory' column.
unique_obesitycategory_values = df['ObesityCategory'].unique()

print("Unique values in the 'Obesity Category' column:")
print(unique_obesitycategory_values)

# Donut chart of the class distribution. The full DataFrame is the data
# source and the column is referenced by name; with no `values` given, each
# sector's size is the number of rows carrying that label.
fig = px.pie(
    df,
    names='ObesityCategory',
    hole=0.48,
    color_discrete_sequence=px.colors.sequential.Jet,
)
fig.update_layout(template='plotly_dark', width=750, height=750)

# Show the plot
fig.show()

Unique values in the 'Obesity Category' column:
['Normal weight' 'Obese' 'Overweight' 'Underweight']

# Distinct values present in the 'Gender' column.
unique_gender_values = df['Gender'].unique()

print("Unique values in the 'Gender' column:")
print(unique_gender_values)

# Donut chart of the gender split. `color='Gender'` is required for
# color_discrete_map to take effect: the map is keyed on the values of the
# column assigned to `color`, and is ignored when no such column is set.
fig = px.pie(
    df,
    names='Gender',
    color='Gender',
    hole=0.64,
    color_discrete_map={'Male': 'blue', 'Female': 'red'},
    color_discrete_sequence=px.colors.sequential.Jet,
)
fig.update_layout(template='plotly_dark', width=750, height=750)
fig.show()

Unique values in the 'Gender' column:
['Male' 'Female']

# Peek at the first five rows.
df.head(5)

# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    1000 non-null   int64  
 1   Gender                 1000 non-null   object 
 2   Height                 1000 non-null   float64
 3   Weight                 1000 non-null   float64
 4   BMI                    1000 non-null   float64
 5   PhysicalActivityLevel  1000 non-null   int64  
 6   ObesityCategory        1000 non-null   object 
dtypes: float64(3), int64(2), object(2)
memory usage: 54.8+ KB

# Same df.info() call as the previous cell, repeated before Height and
# Weight are converted to integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    1000 non-null   int64  
 1   Gender                 1000 non-null   object 
 2   Height                 1000 non-null   float64
 3   Weight                 1000 non-null   float64
 4   BMI                    1000 non-null   float64
 5   PhysicalActivityLevel  1000 non-null   int64  
 6   ObesityCategory        1000 non-null   object 
dtypes: float64(3), int64(2), object(2)
memory usage: 54.8+ KB

# Round height and weight to whole numbers and store them as 64-bit integers.
df['Height'] = df['Height'].round().astype('int64')
df['Weight'] = df['Weight'].round().astype('int64')

# Parallel-coordinates view of the integer-valued columns, coloured by age.
# 'PhysicalActivityLevel' is already int64 and therefore selected here;
# concatenating it a second time would give the frame two columns with the
# same name. The dtype is spelled 'int64' rather than the builtin `int`,
# which can resolve to a different integer width on some platforms.
fig = px.parallel_coordinates(
    df.select_dtypes(include='int64'),
    color="Age",
    color_continuous_scale=px.colors.sequential.Rainbow,
    color_continuous_midpoint=50
)

fig.update_layout(template='plotly_dark', width=1200, height=500)  # Adjust width and height as needed
fig.show()
# NOTE(review): placeholder output path - the target directory presumably has
# to exist before this call; replace with a real location.
fig.write_html("path/to/your/visualization.html")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 df['Height'] = df['Height'].round().astype('int64')
      2 df['Weight'] = df['Weight'].round().astype('int64')
      4 fig = px.parallel_coordinates(
      5     pd.concat([df.select_dtypes(int), df['PhysicalActivityLevel']], axis=1),
      6     color="Age",
      7     color_continuous_scale=px.colors.sequential.Rainbow,
      8     color_continuous_midpoint=50
      9 )

NameError: name 'df' is not defined

# 3D scatter of weight, height and physical-activity level, coloured by
# activity level on the Jet scale.
fig = px.scatter_3d(
    df,
    x='Weight',
    y='Height',
    z='PhysicalActivityLevel',
    color='PhysicalActivityLevel',
    title='Enhanced 3D Scatter Plot of Weight, Height, and Physical Activity Level',
    color_continuous_scale=px.colors.sequential.Jet
)

# Dark theme and the final canvas size. The size is set only here; a different
# width/height passed to the constructor would be overwritten by this call.
fig.update_layout(template='plotly_dark', width=750, height=750)

fig.show()

# 3D scatter of age, BMI and obesity category, coloured by age on the Jet
# scale. The title names the three axes that are actually plotted.
fig = px.scatter_3d(
    df,
    x='Age',
    y='BMI',
    z='ObesityCategory',
    color='Age',
    title='Enhanced 3D Scatter Plot of Age, BMI, and Obesity Category',
    color_continuous_scale=px.colors.sequential.Jet
)

# Dark theme and the final canvas size. The size is set only here; a different
# width/height passed to the constructor would be overwritten by this call.
fig.update_layout(template='plotly_dark', width=750, height=750)

fig.show()

# Bar chart of age against physical-activity level, coloured by age.
fig = px.bar(
    df,
    x='PhysicalActivityLevel',
    y='Age',
    title='Bar Chart of Age and Physical Activity Level',
    color='Age',
    color_continuous_scale='Jet',
    labels={'Age': 'Age (years)'}
)

# Set dark theme
fig.update_layout(template='plotly_dark', width=1214, height=750)

# Render explicitly, like every other figure in this file, instead of relying
# on the notebook echoing the value of the last expression in the cell.
fig.show()

# Area chart of BMI over age, one area per physical-activity level.
# The rows are ordered by age before plotting: each trace is drawn in row
# order, so unsorted ages make the outline jump back and forth along the
# x-axis. sort_values returns a copy, so `df` itself keeps its row order.
fig = px.area(df.sort_values('Age'), x='Age', y='BMI',
              line_group='PhysicalActivityLevel', color='PhysicalActivityLevel',
              labels={'Age': 'Age (years)', 'BMI': 'BMI'},
              title='BMI Over Age with Physical Activity Level',
              color_discrete_sequence=px.colors.sequential.Jet)  # Apply 'Jet' color scheme

# Set dark theme
fig.update_layout(template='plotly_dark', width=1214, height=750)

# One tick every five years along the whole age axis. A fixed tick list of
# 25..50 would leave most of the 18-79 age range in the data unlabelled.
fig.update_xaxes(tickmode='linear', dtick=5)

# Show the plot
fig.show()

# Box plot of BMI for each age value, one colour per age taken from the Jet
# sequence, on a wide dark-themed canvas.
fig = px.box(
    data_frame=df,
    x='Age',
    y='BMI',
    color='Age',
    color_discrete_sequence=px.colors.sequential.Jet,
).update_layout(template='plotly_dark', width=1214, height=750)
fig.show()

# Age bin edges: two-year steps from 16 up to 28, then five-year steps from
# 30 up to 100.
bins = [*range(16, 30, 2), *range(30, 101, 5)]
# One 'lower-upper' label per pair of neighbouring edges.
labels = [f'{lower}-{upper}' for lower, upper in zip(bins, bins[1:])]

# Assign every row to its left-closed age interval, [lower, upper).
df['Age_Category'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# Box plot of weight per age category on a wide dark-themed canvas.
fig = px.box(
    data_frame=df,
    x='Age_Category',
    y='Weight',
    color='Age_Category',
    color_discrete_sequence=px.colors.sequential.Jet,
).update_layout(template='plotly_dark', width=1214, height=750)

fig.show()

# Density heatmap of BMI against age, 25 bins on each axis, Jet colour scale.
fig = px.density_heatmap(
    data_frame=df,
    x='Age',
    y='BMI',
    nbinsx=25,
    nbinsy=25,
    color_continuous_scale=px.colors.sequential.Jet,
)

# Square dark-themed canvas.
fig.update_layout(width=750, height=750, template='plotly_dark')

fig.show()

# Weight bin edges and one label per interval. Each label names the interval
# it is attached to: the first bin is (24, 50], the last is (110, 120].
bins = [24, 50, 80, 110, 120]
labels = ['24-50', '50-80', '80-110', '110-120']

# Create a new column 'Weight_Category' based on the bins (right-closed).
df['Weight_Category'] = pd.cut(df['Weight'], bins=bins, labels=labels, right=True)

# Histogram of the weight categories. The colour map is keyed on the same
# labels as above, and category_orders keeps the bars in ascending bin order.
fig = px.histogram(df, x='Weight_Category', color='Weight_Category',
                   title='Distribution of Weight Categories',
                   labels={'Weight_Category': 'Weight Category'},
                   template='plotly_dark', width=800, height=800,
                   category_orders={'Weight_Category': labels},
                   color_discrete_map={'24-50': 'blue', '50-80': 'green', '80-110': 'orange', '110-120': 'red'})

# Show the plot
fig.show()

# Quartiles of the height distribution.
q1, q2, q3 = df['Height'].quantile([0.25, 0.5, 0.75])

# Bin edges for left-closed intervals: [min, q1), [q1, q2), [q2, q3), [q3, inf).
# The last edge is left open-ended: with right=False an upper edge equal to
# the maximum would exclude the tallest rows from every bin, leaving them NaN.
bins = [df['Height'].min(), q1, q2, q3, float('inf')]

# Specify bin labels
labels = ['Short', 'Medium Short', 'Medium Tall', 'Tall']

# Create a new column 'Height_Category' based on the bins
df['Height_Category'] = pd.cut(df['Height'], bins=bins, labels=labels, right=False)

# Histogram of the height categories, bars ordered from shortest to tallest.
fig = px.histogram(df, x='Height_Category', color='Height_Category',
                   title='Distribution of Height Categories',
                   labels={'Height_Category': 'Height Category'},
                   template='plotly_dark', width=800, height=800,
                   category_orders={'Height_Category': labels},
                   color_discrete_map={'Medium Tall': 'blue', 'Medium Short': 'green', 'Tall': 'orange', 'Short': 'red'})

# Show the plot
fig.show()

# Height-to-weight ratio, computed element-wise for every row.
df['HW_Ratio'] = df['Height'].div(df['Weight'])

# Scatter of height against the ratio; marker size follows the ratio and
# marker colour follows height on the Jet scale.
fig = px.scatter(
    data_frame=df,
    x='HW_Ratio',
    y='Height',
    color='Height',
    size='HW_Ratio',
    template='plotly_dark',
    color_continuous_scale=px.colors.sequential.Jet,
)
fig.show()

# Interaction feature: BMI multiplied element-wise by the activity level.
df['BMI_PhysicalActivity'] = df['BMI'].mul(df['PhysicalActivityLevel'])

# Density heatmap of BMI against the interaction feature, 15 bins per axis,
# on a square dark-themed canvas.
fig = px.density_heatmap(
    data_frame=df,
    x='BMI_PhysicalActivity',
    y='BMI',
    nbinsx=15,
    nbinsy=15,
    color_continuous_scale=px.colors.sequential.Jet,
).update_layout(template='plotly_dark', width=750, height=750)
fig.show()

# Parallel-categories diagram across the categorical columns, with each
# ribbon coloured by BMI on the Rainbow scale. Every axis label is mapped to
# its own column name.
fig = px.parallel_categories(
    data_frame=df,
    dimensions=[
        'PhysicalActivityLevel',
        'Age_Category',
        'Weight_Category',
        'Height_Category',
        'ObesityCategory',
        'Gender',
    ],
    color='BMI',
    labels=dict(
        PhysicalActivityLevel='PhysicalActivityLevel',
        Age_Category='Age_Category',
        Weight_Category='Weight_Category',
        Height_Category='Height_Category',
        ObesityCategory='ObesityCategory',
        Gender='Gender',
    ),
    color_continuous_scale='Rainbow',
).update_layout(template='plotly_dark', width=1214, height=750)
fig.show()

# Violin plots of the BMI x activity interaction per age value, coloured by
# age, with one facet row per height category.
fig = px.violin(
    data_frame=df,
    x='Age',
    y='BMI_PhysicalActivity',
    color='Age',
    facet_row='Height_Category',
).update_layout(template='plotly_dark', width=1200, height=900)

fig.show()

	Age	Gender	Height	Weight	BMI	PhysicalActivityLevel	ObesityCategory
count	1000.000000	1000	1000.000000	1000.000000	1000.000000	1000.000000	1000
unique	NaN	2	NaN	NaN	NaN	NaN	4
top	NaN	Male	NaN	NaN	NaN	NaN	Normal weight
freq	NaN	523	NaN	NaN	NaN	NaN	371
mean	49.857000	NaN	170.052417	71.205769	24.888317	2.534000	NaN
std	18.114267	NaN	10.309971	15.509849	6.193912	1.116284	NaN
min	18.000000	NaN	136.115719	26.065730	8.470572	1.000000	NaN
25%	35.000000	NaN	163.514205	61.129629	20.918068	2.000000	NaN
50%	50.000000	NaN	169.801665	71.929072	24.698647	3.000000	NaN
75%	66.000000	NaN	177.353596	81.133746	28.732132	4.000000	NaN
max	79.000000	NaN	201.419670	118.907366	50.791898	4.000000	NaN

	Age	Gender	Height	Weight	BMI	PhysicalActivityLevel	ObesityCategory
0	56	Male	173.575262	71.982051	23.891783	4	Normal weight
1	69	Male	164.127306	89.959256	33.395209	2	Obese
2	46	Female	168.072202	72.930629	25.817737	4	Overweight
3	32	Male	168.459633	84.886912	29.912247	3	Overweight
4	60	Male	183.568568	69.038945	20.487903	3	Normal weight
5	25	Female	166.405627	61.145868	22.081628	4	Normal weight
6	78	Male	183.566334	92.208521	27.364341	3	Overweight
7	38	Male	142.875095	59.359746	29.078966	1	Overweight
8	56	Male	183.478558	75.157672	22.325577	4	Normal weight
9	75	Male	182.974061	81.533460	24.353244	2	Normal weight

Importing necessary libraries¶

Loading the dataset¶

Descriptive analytics and statistics¶

Data distribution of 'Obesity Category'¶

Gender ratio¶

Data Exploration¶

Multivariate analysis¶

Scatter Plot of Weight, Height, and Physical Activity Level¶

Scatter Plot of Age, BMI, and Obesity Category¶

Age Categorization¶

Correlation analysis¶

Weight Categorization¶

Height Categorization¶

Height : Weight Ratio¶

BMI and PhysicalActivityLevel Interaction¶