Importing necessary libraries¶
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
/tmp/ipykernel_78301/2972804336.py:1: DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), (to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) but was not found to be installed on your system. If this would cause problems for you, please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 import pandas as pd
Loading the dataset¶
In [3]:
df = pd.read_csv('obesity_data.csv')
Descriptive analytics and statistics¶
In [4]:
df.describe(include='all')
Out[4]:
Age | Gender | Height | Weight | BMI | PhysicalActivityLevel | ObesityCategory | |
---|---|---|---|---|---|---|---|
count | 1000.000000 | 1000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000 |
unique | NaN | 2 | NaN | NaN | NaN | NaN | 4 |
top | NaN | Male | NaN | NaN | NaN | NaN | Normal weight |
freq | NaN | 523 | NaN | NaN | NaN | NaN | 371 |
mean | 49.857000 | NaN | 170.052417 | 71.205769 | 24.888317 | 2.534000 | NaN |
std | 18.114267 | NaN | 10.309971 | 15.509849 | 6.193912 | 1.116284 | NaN |
min | 18.000000 | NaN | 136.115719 | 26.065730 | 8.470572 | 1.000000 | NaN |
25% | 35.000000 | NaN | 163.514205 | 61.129629 | 20.918068 | 2.000000 | NaN |
50% | 50.000000 | NaN | 169.801665 | 71.929072 | 24.698647 | 3.000000 | NaN |
75% | 66.000000 | NaN | 177.353596 | 81.133746 | 28.732132 | 4.000000 | NaN |
max | 79.000000 | NaN | 201.419670 | 118.907366 | 50.791898 | 4.000000 | NaN |
In [5]:
df.head(10)
Out[5]:
Age | Gender | Height | Weight | BMI | PhysicalActivityLevel | ObesityCategory | |
---|---|---|---|---|---|---|---|
0 | 56 | Male | 173.575262 | 71.982051 | 23.891783 | 4 | Normal weight |
1 | 69 | Male | 164.127306 | 89.959256 | 33.395209 | 2 | Obese |
2 | 46 | Female | 168.072202 | 72.930629 | 25.817737 | 4 | Overweight |
3 | 32 | Male | 168.459633 | 84.886912 | 29.912247 | 3 | Overweight |
4 | 60 | Male | 183.568568 | 69.038945 | 20.487903 | 3 | Normal weight |
5 | 25 | Female | 166.405627 | 61.145868 | 22.081628 | 4 | Normal weight |
6 | 78 | Male | 183.566334 | 92.208521 | 27.364341 | 3 | Overweight |
7 | 38 | Male | 142.875095 | 59.359746 | 29.078966 | 1 | Overweight |
8 | 56 | Male | 183.478558 | 75.157672 | 22.325577 | 4 | Normal weight |
9 | 75 | Male | 182.974061 | 81.533460 | 24.353244 | 2 | Normal weight |
Data distribution of 'Obesity Category'¶
In [6]:
unique_obesitycategory_values = df['ObesityCategory'].unique()
print("Unique values in the 'Obesity Category' column:")
print(unique_obesitycategory_values)
fig = px.pie(unique_obesitycategory_values, names=df['ObesityCategory'], hole=0.48,
color_discrete_sequence=px.colors.sequential.Jet)
fig.update_layout(template='plotly_dark', width=750, height=750)
# Show the plot
fig.show()
Unique values in the 'Obesity Category' column: ['Normal weight' 'Obese' 'Overweight' 'Underweight']
Gender ratio¶
In [6]:
unique_gender_values = df['Gender'].unique()
print("Unique values in the 'Gender' column:")
print(unique_gender_values)
fig = px.pie(unique_gender_values, names=df['Gender'], hole=0.64, color_discrete_map={'Male': 'blue', 'Female': 'red'}, color_discrete_sequence=px.colors.sequential.Jet)
fig.update_layout(template='plotly_dark', width=750, height=750)
fig.show()
Unique values in the 'Gender' column: ['Male' 'Female']
In [7]:
df.head(5)
Out[7]:
Age | Gender | Height | Weight | BMI | PhysicalActivityLevel | ObesityCategory | |
---|---|---|---|---|---|---|---|
0 | 56 | Male | 173.575262 | 71.982051 | 23.891783 | 4 | Normal weight |
1 | 69 | Male | 164.127306 | 89.959256 | 33.395209 | 2 | Obese |
2 | 46 | Female | 168.072202 | 72.930629 | 25.817737 | 4 | Overweight |
3 | 32 | Male | 168.459633 | 84.886912 | 29.912247 | 3 | Overweight |
4 | 60 | Male | 183.568568 | 69.038945 | 20.487903 | 3 | Normal weight |
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1000 non-null int64 1 Gender 1000 non-null object 2 Height 1000 non-null float64 3 Weight 1000 non-null float64 4 BMI 1000 non-null float64 5 PhysicalActivityLevel 1000 non-null int64 6 ObesityCategory 1000 non-null object dtypes: float64(3), int64(2), object(2) memory usage: 54.8+ KB
Data Exploration¶
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1000 non-null int64 1 Gender 1000 non-null object 2 Height 1000 non-null float64 3 Weight 1000 non-null float64 4 BMI 1000 non-null float64 5 PhysicalActivityLevel 1000 non-null int64 6 ObesityCategory 1000 non-null object dtypes: float64(3), int64(2), object(2) memory usage: 54.8+ KB
Multivariate analysis¶
In [1]:
df['Height'] = df['Height'].round().astype('int64')
df['Weight'] = df['Weight'].round().astype('int64')
fig = px.parallel_coordinates(
pd.concat([df.select_dtypes(int), df['PhysicalActivityLevel']], axis=1),
color="Age",
color_continuous_scale=px.colors.sequential.Rainbow,
color_continuous_midpoint=50
)
fig.update_layout(template='plotly_dark', width=1200, height=500) # Adjust width and height as needed
fig.show()
fig.write_html("path/to/your/visualization.html")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 1 ----> 1 df['Height'] = df['Height'].round().astype('int64') 2 df['Weight'] = df['Weight'].round().astype('int64') 4 fig = px.parallel_coordinates( 5 pd.concat([df.select_dtypes(int), df['PhysicalActivityLevel']], axis=1), 6 color="Age", 7 color_continuous_scale=px.colors.sequential.Rainbow, 8 color_continuous_midpoint=50 9 ) NameError: name 'df' is not defined
Scatter Plot of Weight, Height, and Physical Activity Level¶
In [11]:
fig = px.scatter_3d(
df,
x='Weight',
y='Height',
z='PhysicalActivityLevel',
color='PhysicalActivityLevel',
width=800,
height=600,
title='Enhanced 3D Scatter Plot of Weight, Height, and Physical Activity Level',
color_continuous_scale=px.colors.sequential.Jet
)
# Set dark theme
fig.update_layout(template='plotly_dark', width=750, height=750)
fig.show()
Scatter Plot of Weight, Height, and Physical Activity Level¶
In [12]:
fig = px.scatter_3d(
df,
x='Age',
y='BMI',
z='ObesityCategory',
color='Age',
width=800,
height=600,
title='Enhanced 3D Scatter Plot of Weight, Height, and Physical Activity Level',
color_continuous_scale=px.colors.sequential.Jet
)
# Set dark theme
fig.update_layout(template='plotly_dark', width=750, height=750)
fig.show()
In [13]:
fig = px.bar(
df,
x='PhysicalActivityLevel',
y='Age',
title='Bar Chart of Age and Physical Activity Level',
color='Age',
color_continuous_scale='Jet',
labels={'Age': 'Age (years)',}
)
# Set dark theme
fig.update_layout(template='plotly_dark', width=1214, height=750)
In [14]:
fig = px.area(df, x='Age', y='BMI', line_group='PhysicalActivityLevel', color='PhysicalActivityLevel',
labels={'Age': 'Age (years)', 'BMI': 'BMI'},
title='BMI Over Age with Physical Activity Level',
color_discrete_sequence=px.colors.sequential.Jet) # Apply 'Jet' color scheme
# Set dark theme
fig.update_layout(template='plotly_dark', width=1214, height=750)
# Make the graph more granular
fig.update_xaxes(tickvals=[25, 30, 35, 40, 45, 50], tickmode='array') # Adjust x-axis ticks as needed
# Show the plot
fig.show()
In [15]:
fig = px.box(df, x='Age', y='BMI', color='Age', color_discrete_sequence=px.colors.sequential.Jet)
fig.update_layout(template='plotly_dark', width=1214, height=750)
fig.show()
Age Categorization¶
In [16]:
# Define the age bins and labels
bins = [16, 18, 20, 22, 24, 26, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
labels = ['16-18', '18-20', '20-22', '22-24', '24-26', '26-28', '28-30', '30-35', '35-40', '40-45', '45-50',
'50-55', '55-60', '60-65', '65-70', '70-75', '75-80', '80-85', '85-90', '90-95', '95-100']
# Create a new column 'Age_Category' based on the bins
df['Age_Category'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
# Create the box plot
fig = px.box(df, x='Age_Category', y='Weight', color='Age_Category', color_discrete_sequence=px.colors.sequential.Jet)
# Customize the layout
fig.update_layout(template='plotly_dark', width=1214, height=750)
# Show the plot
fig.show()
Correlation analysis¶
In [17]:
fig = px.density_heatmap(df, x='Age', y='BMI', nbinsx=25, nbinsy=25, color_continuous_scale=px.colors.sequential.Jet)
# Update the layout of the figure
fig.update_layout(template='plotly_dark', width=750, height=750)
# Display the figure
fig.show()
Weight Categorization¶
In [18]:
bins = [24, 50, 80, 110, 120]
labels = ['0-24', '24-50', '50-80', '80-110']
# Create a new column 'Weight_Category' based on the bins
df['Weight_Category'] = pd.cut(df['Weight'], bins=bins, labels=labels, right=True)
# Create a histogram using Plotly Express
fig = px.histogram(df, x='Weight_Category', color='Weight_Category',
title='Distribution of Weight Categories',
labels={'Weight_Category': 'Weight Category'},
template='plotly_dark', width=800, height=800,
color_discrete_map={'0-24': 'blue', '24-50': 'green', '50-80': 'orange', '80-110': 'red'})
# Show the plot
fig.show()
Height Categorization¶
In [19]:
# Calculate quartiles
q1, q2, q3 = df['Height'].quantile([0.25, 0.5, 0.75])
# Specify bin edges
bins = [df['Height'].min(), q1, q2, q3, df['Height'].max()]
# Specify bin labels
labels = ['Short', 'Medium Short', 'Medium Tall', 'Tall']
# Create a new column 'Height_Category' based on the bins
df['Height_Category'] = pd.cut(df['Height'], bins=bins, labels=labels, right=False)
fig = px.histogram(df, x='Height_Category', color='Height_Category',
title='Distribution of Height Categories',
labels={'Height_Category': 'Height Category'},
template='plotly_dark', width=800, height=800,
color_discrete_map={'Medium Tall': 'blue', 'Medium Short': 'green', 'Tall': 'orange', 'Short': 'red'})
# Show the plot
fig.show()
Height : Weight Ratio¶
In [20]:
df['HW_Ratio'] = df['Height'] / df['Weight']
fig = px.scatter(df, x='HW_Ratio', y='Height', color='Height', size='HW_Ratio',
template='plotly_dark', color_continuous_scale=px.colors.sequential.Jet)
fig.show()
BMI and PhysicalActivityLevel Interaction¶
In [21]:
# Assuming df is your DataFrame with columns 'BMI' and 'PhysicalActivityLevel'
df['BMI_PhysicalActivity'] = df['BMI'] * df['PhysicalActivityLevel']
fig = px.density_heatmap(df, x='BMI_PhysicalActivity', y='BMI', nbinsx=15, nbinsy=15, color_continuous_scale=px.colors.sequential.Jet)
# Update the layout of the figure
fig.update_layout(template='plotly_dark', width=750, height=750)
fig.show()
In [22]:
fig = px.parallel_categories(df,
dimensions=['PhysicalActivityLevel', 'Age_Category', 'Weight_Category', 'Height_Category', 'ObesityCategory', 'Gender'],
color='BMI',
labels={'PhysicalActivityLevel': 'PhysicalActivityLevel', 'Age_Category': 'Age_Category', 'Weight_Category': 'Weight_Category',
'Height_Category': 'Height_Category', 'ObesityCategory': 'ObesityCategory', 'Gender': 'Gender'},
color_continuous_scale='Rainbow' # Choose your desired color scale
)
fig.update_layout(template='plotly_dark', width=1214, height=750)
fig.show()
In [23]:
fig = px.violin(df, x='Age', y='BMI_PhysicalActivity', color='Age', facet_row='Height_Category')
fig.update_layout(template='plotly_dark', width=1200, height=900)
fig.show()
In [ ]:
In [ ]:
In [ ]: