Python Code Public Examples¶
Juan D. Correa - Software Developer @ Veracruz, MX & California, USA¶
https://www.astropema.com/ astropema@gmail.com¶
Overview of the raw Meteorite Data Set:¶
name: Name of the meteorite.¶
id: Unique identifier.¶
nametype: Indicates if the name is valid.¶
recclass: Classification of the meteorite.¶
mass (g): Mass of the meteorite in grams.¶
fall: Indicates if the meteorite was observed falling or found later.¶
year: Year of the fall/discovery.¶
reclat and reclong: Latitude and longitude of the meteorite landing.¶
GeoLocation: Combined coordinates (latitude, longitude).¶
Unnamed: 10: Appears to be an empty column.¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
data = df = pd.read_csv('Meteorite_Landings.csv')  # 'data' and 'df' alias the same DataFrame
df.head()
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | Unnamed: 10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aachen | 1 | Valid | L5 | 21.0 | Fell | 1880.0 | 50.77500 | 6.08333 | (50.775, 6.08333) | NaN |
1 | Aarhus | 2 | Valid | H6 | 720.0 | Fell | 1951.0 | 56.18333 | 10.23333 | (56.18333, 10.23333) | NaN |
2 | Abee | 6 | Valid | EH4 | 107000.0 | Fell | 1952.0 | 54.21667 | -113.00000 | (54.21667, -113.0) | NaN |
3 | Acapulco | 10 | Valid | Acapulcoite | 1914.0 | Fell | 1976.0 | 16.88333 | -99.90000 | (16.88333, -99.9) | NaN |
4 | Achiras | 370 | Valid | L6 | 780.0 | Fell | 1902.0 | -33.16667 | -64.95000 | (-33.16667, -64.95) | NaN |
print('This is the current state of the data frame BEFORE I start working with it.')
print('The number of rows is', df.shape[0])
print('The number of columns is', df.shape[1])
This is the current state of the data frame BEFORE I start working with it.
The number of rows is 45716
The number of columns is 11
data.describe  # note: missing parentheses, so this returns the bound method rather than the summary (corrected in the next cell)
<bound method NDFrame.describe of              name     id nametype              recclass  mass (g)   fall  \
0              Aachen      1    Valid                    L5      21.0   Fell
1              Aarhus      2    Valid                    H6     720.0   Fell
2                Abee      6    Valid                   EH4  107000.0   Fell
3            Acapulco     10    Valid           Acapulcoite    1914.0   Fell
4             Achiras    370    Valid                    L6     780.0   Fell
...               ...    ...      ...                   ...       ...    ...
45711      Zillah 002  31356    Valid               Eucrite     172.0  Found
45712          Zinder  30409    Valid  Pallasite, ungrouped      46.0  Found
45713            Zlin  30410    Valid                    H4       3.3  Found
45714       Zubkovsky  31357    Valid                    L6    2167.0  Found
45715      Zulu Queen  30414    Valid                  L3.7     200.0  Found

         year    reclat     reclong             GeoLocation  Unnamed: 10
0      1880.0  50.77500     6.08333       (50.775, 6.08333)          NaN
1      1951.0  56.18333    10.23333    (56.18333, 10.23333)          NaN
2      1952.0  54.21667 -113.00000      (54.21667, -113.0)           NaN
3      1976.0  16.88333  -99.90000       (16.88333, -99.9)           NaN
4      1902.0 -33.16667  -64.95000     (-33.16667, -64.95)           NaN
...       ...       ...        ...                     ...          ...
45711  1990.0  29.03700   17.01850       (29.037, 17.0185)           NaN
45712  1999.0  13.78333    8.96667     (13.78333, 8.96667)           NaN
45713  1939.0  49.25000   17.66667        (49.25, 17.66667)          NaN
45714  2003.0  49.78917   41.50460     (49.78917, 41.5046)           NaN
45715  1976.0  33.98333 -115.68333  (33.98333, -115.68333)           NaN

[45716 rows x 11 columns]>
data.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 45716.0 | 26889.735104 | 16860.683030 | 1.00000 | 12688.75000 | 24261.50000 | 40656.75000 | 5.745800e+04 |
mass (g) | 45585.0 | 13278.078549 | 574988.876410 | 0.00000 | 7.20000 | 32.60000 | 202.60000 | 6.000000e+07 |
year | 45425.0 | 1991.828817 | 25.052766 | 860.00000 | 1987.00000 | 1998.00000 | 2003.00000 | 2.101000e+03 |
reclat | 38401.0 | -39.122580 | 46.378511 | -87.36667 | -76.71424 | -71.50000 | 0.00000 | 8.116667e+01 |
reclong | 38401.0 | 61.074319 | 80.647298 | -165.43333 | 0.00000 | 35.66667 | 157.16667 | 3.544733e+02 |
Unnamed: 10 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   name         45716 non-null  object
 1   id           45716 non-null  int64
 2   nametype     45716 non-null  object
 3   recclass     45716 non-null  object
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object
 6   year         45425 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object
 10  Unnamed: 10  0 non-null      float64
dtypes: float64(5), int64(1), object(5)
memory usage: 3.8+ MB
df.memory_usage()
Index              132
name            365728
id              365728
nametype        365728
recclass        365728
mass (g)        365728
fall            365728
year            365728
reclat          365728
reclong         365728
GeoLocation     365728
Unnamed: 10     365728
dtype: int64
df.info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   name         45716 non-null  object
 1   id           45716 non-null  int64
 2   nametype     45716 non-null  object
 3   recclass     45716 non-null  object
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object
 6   year         45425 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object
 10  Unnamed: 10  0 non-null      float64
dtypes: float64(5), int64(1), object(5)
memory usage: 16.3 MB
df.isnull().sum()
name               0
id                 0
nametype           0
recclass           0
mass (g)         131
fall               0
year             291
reclat          7315
reclong         7315
GeoLocation     7315
Unnamed: 10    45716
dtype: int64
df.nunique()
name           45716
id             45716
nametype           2
recclass         466
mass (g)       12576
fall               2
year             265
reclat         12738
reclong        14640
GeoLocation    17100
Unnamed: 10        0
dtype: int64
df.isnull().sum().any()
True
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
print(f"Frequency table for {col}:\n")
print(df[col].value_counts())
print("\n")
Frequency table for name:

name
Aachen                    1
Northwest Africa 7459     1
Northwest Africa 7404     1
Northwest Africa 7407     1
Northwest Africa 7408     1
                         ..
Grove Mountains 052250    1
Grove Mountains 052253    1
Grove Mountains 052254    1
Grove Mountains 052256    1
Zulu Queen                1
Name: count, Length: 45716, dtype: int64

Frequency table for nametype:

nametype
Valid     45641
Relict       75
Name: count, dtype: int64

Frequency table for recclass:

recclass
L6         8285
H5         7142
L5         4796
H6         4528
H4         4211
           ...
EL7           1
CH/CBb        1
H/L~4         1
LL3.7-6       1
L/LL          1
Name: count, Length: 466, dtype: int64

Frequency table for fall:

fall
Found    44609
Fell      1107
Name: count, dtype: int64

Frequency table for GeoLocation:

GeoLocation
(0.0, 0.0)                6214
(-71.5, 35.66667)         4761
(-84.0, 168.0)            3040
(-72.0, 26.0)             1505
(-79.68333, 159.75)        657
                          ...
(-76.30361, 157.17611)       1
(-76.28611, 157.23972)       1
(-76.31889, 157.265)         1
(-76.28722, 157.19333)       1
(33.98333, -115.68333)       1
Name: count, Length: 17100, dtype: int64
# Identify categorical and numeric columns
cat_cols = data.select_dtypes(include=['object', 'category']).columns
numeric_cols = data.select_dtypes(include=['number']).columns
print(f"Categorical columns: {cat_cols}")
print(f"Numeric columns: {numeric_cols}")
Categorical columns: Index(['name', 'nametype', 'recclass', 'fall', 'GeoLocation'], dtype='object')
Numeric columns: Index(['id', 'mass (g)', 'year', 'reclat', 'reclong', 'Unnamed: 10'], dtype='object')
# Converting the data type of each categorical variable to 'category'
for column in cat_cols:
data[column]=data[column].astype('category')
data.info  # missing parentheses again: this returns the bound method; data.info() prints the summary
<bound method DataFrame.info of              name     id nametype              recclass  mass (g)   fall  \
0              Aachen      1    Valid                    L5      21.0   Fell
1              Aarhus      2    Valid                    H6     720.0   Fell
2                Abee      6    Valid                   EH4  107000.0   Fell
3            Acapulco     10    Valid           Acapulcoite    1914.0   Fell
4             Achiras    370    Valid                    L6     780.0   Fell
...               ...    ...      ...                   ...       ...    ...
45711      Zillah 002  31356    Valid               Eucrite     172.0  Found
45712          Zinder  30409    Valid  Pallasite, ungrouped      46.0  Found
45713            Zlin  30410    Valid                    H4       3.3  Found
45714       Zubkovsky  31357    Valid                    L6    2167.0  Found
45715      Zulu Queen  30414    Valid                  L3.7     200.0  Found

         year    reclat     reclong             GeoLocation  Unnamed: 10
0      1880.0  50.77500     6.08333       (50.775, 6.08333)          NaN
1      1951.0  56.18333    10.23333    (56.18333, 10.23333)          NaN
2      1952.0  54.21667 -113.00000      (54.21667, -113.0)           NaN
3      1976.0  16.88333  -99.90000       (16.88333, -99.9)           NaN
4      1902.0 -33.16667  -64.95000     (-33.16667, -64.95)           NaN
...       ...       ...        ...                     ...          ...
45711  1990.0  29.03700   17.01850       (29.037, 17.0185)           NaN
45712  1999.0  13.78333    8.96667     (13.78333, 8.96667)           NaN
45713  1939.0  49.25000   17.66667        (49.25, 17.66667)          NaN
45714  2003.0  49.78917   41.50460     (49.78917, 41.5046)           NaN
45715  1976.0  33.98333 -115.68333  (33.98333, -115.68333)           NaN

[45716 rows x 11 columns]>
data.dtypes
name           category
id                int64
nametype       category
recclass       category
mass (g)        float64
fall           category
year            float64
reclat          float64
reclong         float64
GeoLocation    category
Unnamed: 10     float64
dtype: object
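A quick check that the conversion paid off: the deep memory figure should drop well below the 16.3 MB measured before the category conversion (the exact number depends on the pandas version).

# Re-measure deep memory usage after converting object columns to category.
data.info(memory_usage="deep")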
data.head(2)
name | id | nametype | recclass | mass (g) | fall | year | reclat | reclong | GeoLocation | Unnamed: 10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Aachen | 1 | Valid | L5 | 21.0 | Fell | 1880.0 | 50.77500 | 6.08333 | (50.775, 6.08333) | NaN |
1 | Aarhus | 2 | Valid | H6 | 720.0 | Fell | 1951.0 | 56.18333 | 10.23333 | (56.18333, 10.23333) | NaN |
df.isna().values.any()
True
df.isnull().sum().sum()
68083
Data Engineering and Exploration¶
¶
Step 1: Drop the unnecessary Unnamed: 10 column. We'll remove this since it's empty.
Step 2: Use latitude and longitude to fill in missing context. We can use reverse geocoding (mapping coordinates to places) to add information like country or region, which gives valuable context for location-based analysis.
Step 3: Add a new column for hemisphere, calculated from the coordinates: Northern Hemisphere (latitude ≥ 0), Southern Hemisphere (latitude < 0), Eastern Hemisphere (longitude ≥ 0), Western Hemisphere (longitude < 0).
Step 4: Add a continent or ocean classification by mapping latitude and longitude to known continent or ocean zones (a sketch of this step follows below). Let me start by dropping the empty column and generating the hemisphere data; once that's done, we can proceed to continent classification.
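A minimal sketch of Step 4, not run in this notebook: the same Natural Earth shapefile used for the country lookup later can drive a continent lookup, assuming it carries a CONTINENT attribute (true for ne_110m_admin_0_countries, but verify with world.columns); points matching no polygon are labeled Ocean.

# Hedged sketch of Step 4: continent/ocean classification via spatial join.
# 'CONTINENT' is an assumed Natural Earth column name; check world.columns first.
import geopandas as gpd
from shapely.geometry import Point

world = gpd.read_file('ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp')
coords = df.dropna(subset=['reclat', 'reclong']).copy()
coords['geometry'] = [Point(lon, lat) for lon, lat in zip(coords['reclong'], coords['reclat'])]
coords_gdf = gpd.GeoDataFrame(coords, geometry='geometry', crs='EPSG:4326')
joined = gpd.sjoin(coords_gdf, world[['CONTINENT', 'geometry']], how='left', predicate='intersects')
joined = joined[~joined.index.duplicated(keep='first')]  # a border point can touch two polygons
df.loc[joined.index, 'Continent'] = joined['CONTINENT'].fillna('Ocean')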
¶
meteorite_data = df
# Step 1: Drop the unnecessary column
meteorite_data.drop(columns=['Unnamed: 10'], inplace=True)
# Step 2: Define function to classify hemisphere based on coordinates
def classify_hemisphere(lat, lon):
lat_hemisphere = 'Northern' if lat >= 0 else 'Southern'
lon_hemisphere = 'Eastern' if lon >= 0 else 'Western'
return f"{lat_hemisphere} & {lon_hemisphere}"
# Step 3: Apply the hemisphere classification
meteorite_data['Hemisphere'] = meteorite_data.apply(
lambda row: classify_hemisphere(row['reclat'], row['reclong']) if not pd.isnull(row['reclat']) and not pd.isnull(row['reclong']) else 'Unknown',
axis=1
)
# Step 4: Preview the updated DataFrame
meteorite_data[['name', 'reclat', 'reclong', 'Hemisphere']].head()
name | reclat | reclong | Hemisphere | |
---|---|---|---|---|
0 | Aachen | 50.77500 | 6.08333 | Northern & Eastern |
1 | Aarhus | 56.18333 | 10.23333 | Northern & Eastern |
2 | Abee | 54.21667 | -113.00000 | Northern & Western |
3 | Acapulco | 16.88333 | -99.90000 | Northern & Western |
4 | Achiras | -33.16667 | -64.95000 | Southern & Western |
¶
Next Step: Adding Country Information. To add the country for each row, we can use reverse geocoding, which maps latitude and longitude to a country via a geocoding API or a local lookup table. Since we're working offline here, we can use a static world dataset (Natural Earth's country boundaries) and implement the lookup as a spatial join with geopandas.
¶
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
# Step 2: Define function to classify hemisphere based on coordinates
def classify_hemisphere(lat, lon):
lat_hemisphere = 'Northern' if lat >= 0 else 'Southern'
lon_hemisphere = 'Eastern' if lon >= 0 else 'Western'
return f"{lat_hemisphere} & {lon_hemisphere}"
# Step 3: Apply the hemisphere classification
meteorite_data['Hemisphere'] = meteorite_data.apply(
lambda row: classify_hemisphere(row['reclat'], row['reclong']) if not pd.isnull(row['reclat']) and not pd.isnull(row['reclong']) else 'Unknown',
axis=1
)
# Step 4: Load a world boundaries dataset from the local shapefile
world = gpd.read_file('ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp')
# Step 5: Create geometry column with points from latitude and longitude
meteorite_data['geometry'] = meteorite_data.apply(
lambda row: Point(row['reclong'], row['reclat']) if not pd.isnull(row['reclat']) and not pd.isnull(row['reclong']) else None,
axis=1
)
# Step 6: Convert meteorite data to a GeoDataFrame
meteorite_gdf = gpd.GeoDataFrame(meteorite_data, geometry='geometry', crs='EPSG:4326')
# Step 7: Perform a spatial join to map meteorites to countries
meteorite_with_country = gpd.sjoin(meteorite_gdf, world, how='left', predicate='intersects')
# Step 8: Add the correct country name column to the DataFrame
meteorite_data['Country'] = meteorite_with_country['ADMIN'] # Use the 'ADMIN' column for country names
# Step 9: Preview the updated DataFrame
meteorite_data[['name', 'reclat', 'reclong', 'Hemisphere', 'Country']].head()
name | reclat | reclong | Hemisphere | Country | |
---|---|---|---|---|---|
0 | Aachen | 50.77500 | 6.08333 | Northern & Eastern | Belgium |
1 | Aarhus | 56.18333 | 10.23333 | Northern & Eastern | Denmark |
2 | Abee | 54.21667 | -113.00000 | Northern & Western | Canada |
3 | Acapulco | 16.88333 | -99.90000 | Northern & Western | Mexico |
4 | Achiras | -33.16667 | -64.95000 | Southern & Western | Argentina |
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Perform basic statistical analysis on relevant numeric columns
statistical_summary = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].describe()
# Step 4: Calculate the distribution of meteorites by country (placeholder example)
# This step assumes that 'Country' column is available after prior steps; adjust as needed
# For now, using the 'name' field to simulate data exploration
country_distribution = meteorite_data['name'].value_counts().head(10)
# Step 5: Calculate the distribution of meteorites by hemisphere
hemisphere_distribution = meteorite_data['reclat'].apply(lambda lat: 'Northern' if lat >= 0 else 'Southern').value_counts()
# Step 6: Display the results
print("\nStatistical Summary:\n", statistical_summary)
print("\nTop 10 Meteorite Names:\n", country_distribution)
print("\nHemisphere Distribution:\n", hemisphere_distribution)
Statistical Summary:
            mass (g)          year        reclat       reclong
count  4.558500e+04  45425.000000  38401.000000  38401.000000
mean   1.327808e+04   1991.828817    -39.122580     61.074319
std    5.749889e+05     25.052766     46.378511     80.647298
min    0.000000e+00    860.000000    -87.366670   -165.433330
25%    7.200000e+00   1987.000000    -76.714240      0.000000
50%    3.260000e+01   1998.000000    -71.500000     35.666670
75%    2.026000e+02   2003.000000      0.000000    157.166670
max    6.000000e+07   2101.000000     81.166670    354.473330

Top 10 Meteorite Names:
name
Aachen                   1
Northwest Africa 7459    1
Northwest Africa 7404    1
Northwest Africa 7407    1
Northwest Africa 7408    1
Northwest Africa 741     1
Northwest Africa 7410    1
Northwest Africa 7412    1
Northwest Africa 7414    1
Northwest Africa 742     1
Name: count, dtype: int64

Hemisphere Distribution:
reclat
Southern    30728
Northern    14988
Name: count, dtype: int64
¶
Statistical Summary
Mass: the largest meteorite weighs 60,000,000 grams (60,000 kg), and the very large standard deviation indicates significant variation in meteorite mass.
Year: the data has a suspicious entry for the year 2101, suggesting a data error; the inspection sketch below pulls out those rows.
Latitude & Longitude: meteorites span the full globe, with latitudes from -87.37 to 81.17 and longitudes from -165.43 to 354.47 (values above 180 are themselves suspect).
Top 10 Meteorite Names: these are all unique names, many of which are variations on Northwest Africa meteorites.
Hemisphere Distribution: the majority of meteorite landings were recorded in the Southern Hemisphere (30,728 entries) versus 14,988 in the Northern Hemisphere, likely reflecting the large Antarctic collection campaigns.
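Before filtering, it helps to look at the rows behind that year 2101 value; a quick sanity check, assuming the df loaded at the top of the notebook is still in scope:

# Pull out the future-dated entries flagged by the summary above.
print(df[df['year'] > 2025][['name', 'id', 'year', 'fall']])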
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Perform basic statistical analysis on relevant numeric columns
statistical_summary = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].describe()
# Step 4: Calculate the distribution of meteorites by hemisphere
hemisphere_distribution = meteorite_data['reclat'].apply(lambda lat: 'Northern' if lat >= 0 else 'Southern').value_counts()
# Step 5: Handle potential data issues (e.g., year outliers)
# Filter out years beyond a reasonable range (e.g., after 2025)
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Step 6: Visualize mass distribution
plt.figure(figsize=(10, 6))
sns.histplot(meteorite_data['mass (g)'], bins=50, kde=True, color='blue')
plt.title('Distribution of Meteorite Mass')
plt.xlabel('Mass (g)')
plt.ylabel('Frequency')
plt.show()
# Step 7: Visualize meteorite landings by year
plt.figure(figsize=(10, 6))
sns.histplot(meteorite_data['year'], bins=50, color='green')
plt.title('Meteorite Landings Over Time')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.show()
# Step 8: Visualize hemisphere distribution
plt.figure(figsize=(6, 6))
hemisphere_distribution.plot(kind='bar', color='orange')
plt.title('Meteorite Landings by Hemisphere')
plt.xlabel('Hemisphere')
plt.ylabel('Count')
plt.show()
# Step 9: Display the results
print("\nStatistical Summary:\n", statistical_summary)
print("\nHemisphere Distribution:\n", hemisphere_distribution)
Statistical Summary:
            mass (g)          year        reclat       reclong
count  4.558500e+04  45425.000000  38401.000000  38401.000000
mean   1.327808e+04   1991.828817    -39.122580     61.074319
std    5.749889e+05     25.052766     46.378511     80.647298
min    0.000000e+00    860.000000    -87.366670   -165.433330
25%    7.200000e+00   1987.000000    -76.714240      0.000000
50%    3.260000e+01   1998.000000    -71.500000     35.666670
75%    2.026000e+02   2003.000000      0.000000    157.166670
max    6.000000e+07   2101.000000     81.166670    354.473330

Hemisphere Distribution:
reclat
Southern    30728
Northern    14988
Name: count, dtype: int64
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Perform basic statistical analysis on relevant numeric columns
statistical_summary = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].describe()
# Step 4: Calculate the distribution of meteorites by hemisphere
hemisphere_distribution = meteorite_data['reclat'].apply(lambda lat: 'Northern' if lat >= 0 else 'Southern').value_counts()
# Step 5: Handle potential data issues (e.g., year outliers)
# Filter out years beyond a reasonable range (e.g., after 2025)
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Step 6: Visualize mass distribution
plt.figure(figsize=(10, 6))
sns.histplot(meteorite_data['mass (g)'], bins=50, kde=True, color='blue')
plt.title('Distribution of Meteorite Mass')
plt.xlabel('Mass (g)')
plt.ylabel('Frequency')
plt.show()
# Step 7: Display statistical summary again (one visualization at a time)
print("\nStatistical Summary:\n", statistical_summary)
Statistical Summary:
            mass (g)          year        reclat       reclong
count  4.558500e+04  45425.000000  38401.000000  38401.000000
mean   1.327808e+04   1991.828817    -39.122580     61.074319
std    5.749889e+05     25.052766     46.378511     80.647298
min    0.000000e+00    860.000000    -87.366670   -165.433330
25%    7.200000e+00   1987.000000    -76.714240      0.000000
50%    3.260000e+01   1998.000000    -71.500000     35.666670
75%    2.026000e+02   2003.000000      0.000000    157.166670
max    6.000000e+07   2101.000000     81.166670    354.473330
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Perform basic statistical analysis on relevant numeric columns
statistical_summary = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].describe()
# Step 4: Calculate the distribution of meteorites by hemisphere
hemisphere_distribution = meteorite_data['reclat'].apply(lambda lat: 'Northern' if lat >= 0 else 'Southern').value_counts()
# Step 5: Handle potential data issues (e.g., year outliers)
# Filter out years beyond a reasonable range (e.g., after 2025)
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Step 6: Visualize meteorite landings over time
plt.figure(figsize=(10, 6))
sns.histplot(meteorite_data['year'], bins=50, color='green')
plt.title('Meteorite Landings Over Time')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.show()
# Step 7: Display statistical summary again (next visualization step)
print("\nStatistical Summary:\n", statistical_summary)
Statistical Summary:
            mass (g)          year        reclat       reclong
count  4.558500e+04  45425.000000  38401.000000  38401.000000
mean   1.327808e+04   1991.828817    -39.122580     61.074319
std    5.749889e+05     25.052766     46.378511     80.647298
min    0.000000e+00    860.000000    -87.366670   -165.433330
25%    7.200000e+00   1987.000000    -76.714240      0.000000
50%    3.260000e+01   1998.000000    -71.500000     35.666670
75%    2.026000e+02   2003.000000      0.000000    157.166670
max    6.000000e+07   2101.000000     81.166670    354.473330
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Perform basic statistical analysis on relevant numeric columns
statistical_summary = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].describe()
# Step 4: Calculate the distribution of meteorites by hemisphere
hemisphere_distribution = meteorite_data['reclat'].apply(lambda lat: 'Northern' if lat >= 0 else 'Southern').value_counts()
# Step 5: Handle potential data issues (e.g., year outliers)
# Filter out years beyond a reasonable range (e.g., after 2025)
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Step 6: Visualize hemisphere distribution
plt.figure(figsize=(6, 6))
hemisphere_distribution.plot(kind='bar', color='orange')
plt.title('Meteorite Landings by Hemisphere')
plt.xlabel('Hemisphere')
plt.ylabel('Count')
plt.show()
# Step 7: Display statistical summary again (next visualization step)
print("\nStatistical Summary:\n", statistical_summary)
Statistical Summary:
            mass (g)          year        reclat       reclong
count  4.558500e+04  45425.000000  38401.000000  38401.000000
mean   1.327808e+04   1991.828817    -39.122580     61.074319
std    5.749889e+05     25.052766     46.378511     80.647298
min    0.000000e+00    860.000000    -87.366670   -165.433330
25%    7.200000e+00   1987.000000    -76.714240      0.000000
50%    3.260000e+01   1998.000000    -71.500000     35.666670
75%    2.026000e+02   2003.000000      0.000000    157.166670
max    6.000000e+07   2101.000000     81.166670    354.473330
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Perform basic statistical analysis on relevant numeric columns
statistical_summary = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].describe()
# Step 4: Add the country column using spatial join
# Load the world boundaries dataset from the shapefile
world = gpd.read_file('ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp')
# Create geometry column with points from latitude and longitude
meteorite_data['geometry'] = meteorite_data.apply(
lambda row: Point(row['reclong'], row['reclat']) if not pd.isnull(row['reclat']) and not pd.isnull(row['reclong']) else None,
axis=1
)
# Convert to a GeoDataFrame
meteorite_gdf = gpd.GeoDataFrame(meteorite_data, geometry='geometry', crs='EPSG:4326')
# Perform spatial join to map meteorites to countries
meteorite_with_country = gpd.sjoin(meteorite_gdf, world, how='left', predicate='intersects')
# Add the country name column
meteorite_data['Country'] = meteorite_with_country['ADMIN']
# Step 5: Filter out years beyond a reasonable range (e.g., after 2025)
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Step 6: Visualize the top 10 countries by number of meteorite landings
top_countries = meteorite_data['Country'].value_counts().head(10)
plt.figure(figsize=(10, 6))
top_countries.plot(kind='bar', color='purple')
plt.title('Top 10 Countries by Meteorite Landings')
plt.xlabel('Country')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
# Step 7: Display statistical summary again
print("\nStatistical Summary:\n", statistical_summary)
Statistical Summary:
            mass (g)          year        reclat       reclong
count  4.558500e+04  45425.000000  38401.000000  38401.000000
mean   1.327808e+04   1991.828817    -39.122580     61.074319
std    5.749889e+05     25.052766     46.378511     80.647298
min    0.000000e+00    860.000000    -87.366670   -165.433330
25%    7.200000e+00   1987.000000    -76.714240      0.000000
50%    3.260000e+01   1998.000000    -71.500000     35.666670
75%    2.026000e+02   2003.000000      0.000000    157.166670
max    6.000000e+07   2101.000000     81.166670    354.473330
Model Training¶
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for regression analysis
# Remove rows with missing values in relevant columns
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to handle large values
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Step 4: Define features and target variable
X = meteorite_data[['year', 'reclat', 'reclong']]
y = meteorite_data['log_mass']
# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 6: Train a linear regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Step 7: Make predictions and evaluate the model
y_pred = regressor.predict(X_test)
# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Step 8: Display results
print("\nRegression Model Results:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")
# Step 9: Plot actual vs. predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.title('Actual vs Predicted Log(Mass)')
plt.xlabel('Actual Log(Mass)')
plt.ylabel('Predicted Log(Mass)')
plt.show()
Regression Model Results:
Mean Squared Error: 3.87
R² Score: 0.32
¶
Analysis of Regression Results:
Mean Squared Error (MSE): an MSE of 3.87 on the log-transformed mass indicates moderate error.
R² Score: at 0.32, only 32% of the variance in meteorite mass is explained by the features (year, latitude, longitude), so the model isn't capturing all factors influencing meteorite mass.
Scatter Plot: the dashed line represents an ideal fit (actual = predicted); the spread of points shows that some predictions fall far from the actual values, especially for higher masses.
Next Steps (Options): improve the model by adding more relevant features (e.g., recclass, fall) or trying non-linear models like Random Forest or Gradient Boosting; or move to clustering, identifying groups of meteorites based on location, mass, or class using K-Means or DBSCAN.
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for regression analysis
# Remove rows with missing values in relevant columns
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong', 'fall', 'recclass']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Encode categorical variables
meteorite_data = pd.get_dummies(meteorite_data, columns=['fall', 'recclass'], drop_first=True)
# Log-transform mass to handle large values
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Step 4: Define features and target variable
X = meteorite_data.drop(columns=['mass (g)', 'log_mass'])
y = meteorite_data['log_mass']
# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 6: Train and evaluate Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
# Step 7: Train and evaluate Gradient Boosting model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_mse = mean_squared_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)
# Step 8: Display results
print("\nRandom Forest Results:")
print(f"Mean Squared Error: {rf_mse:.2f}")
print(f"R² Score: {rf_r2:.2f}")
print("\nGradient Boosting Results:")
print(f"Mean Squared Error: {gb_mse:.2f}")
print(f"R² Score: {gb_r2:.2f}")
# Step 9: Plot actual vs predicted values for both models
plt.figure(figsize=(14, 6))
# Random Forest plot
plt.subplot(1, 2, 1)
plt.scatter(y_test, rf_pred, alpha=0.7, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.title('Random Forest: Actual vs Predicted Log(Mass)')
plt.xlabel('Actual Log(Mass)')
plt.ylabel('Predicted Log(Mass)')
# Gradient Boosting plot
plt.subplot(1, 2, 2)
plt.scatter(y_test, gb_pred, alpha=0.7, color='green')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.title('Gradient Boosting: Actual vs Predicted Log(Mass)')
plt.xlabel('Actual Log(Mass)')
plt.ylabel('Predicted Log(Mass)')
plt.tight_layout()
plt.show()
Random Forest Results:
Mean Squared Error: 2.35
R² Score: 0.59

Gradient Boosting Results:
Mean Squared Error: 2.83
R² Score: 0.50
¶
Regression Model Comparison Results
Random Forest: MSE 2.35, R² 0.59. The model explains 59% of the variance, a significant improvement over linear regression.
Gradient Boosting: MSE 2.83, R² 0.50. Gradient Boosting performs slightly worse than Random Forest but still better than the linear regression model.
Visualization: both models follow the general trend, but Random Forest's predictions sit tighter along the ideal line.
¶
The next two cells first tune both models with GridSearchCV, then analyze feature importance using the best Random Forest configuration. The feature-importance analysis does the following:
Feature Importance Calculation: the model ranks features based on their contribution to the predictions.
Visualization: a bar chart displays the top 20 most important features.
Output: the top features are printed in descending order of importance.
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for regression analysis
# Remove rows with missing values in relevant columns
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to handle large values
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Step 4: Define features and target variable
X = meteorite_data[['year', 'reclat', 'reclong']]
y = meteorite_data['log_mass']
# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 6: Hyperparameter tuning for Random Forest
rf_param_grid = {
'n_estimators': [100, 200],
'max_depth': [10, 20, None],
'min_samples_split': [2, 5]
}
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='r2')
rf_grid_search.fit(X_train, y_train)
# Step 7: Hyperparameter tuning for Gradient Boosting
gb_param_grid = {
'n_estimators': [100, 200],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 10]
}
gb_grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), gb_param_grid, cv=5, scoring='r2')
gb_grid_search.fit(X_train, y_train)
# Step 8: Evaluate the best models
rf_best_model = rf_grid_search.best_estimator_
gb_best_model = gb_grid_search.best_estimator_
rf_pred = rf_best_model.predict(X_test)
gb_pred = gb_best_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
gb_mse = mean_squared_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)
# Step 9: Display results
print("\nBest Random Forest Model:", rf_grid_search.best_params_)
print(f"Random Forest MSE: {rf_mse:.2f}, R²: {rf_r2:.2f}")
print("\nBest Gradient Boosting Model:", gb_grid_search.best_params_)
print(f"Gradient Boosting MSE: {gb_mse:.2f}, R²: {gb_r2:.2f}")
# Step 10: Plot actual vs predicted values for both models
plt.figure(figsize=(14, 6))
# Random Forest plot
plt.subplot(1, 2, 1)
plt.scatter(y_test, rf_pred, alpha=0.7, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', lw=2, color='red')
plt.title('Tuned Random Forest: Actual vs Predicted Log(Mass)')
plt.xlabel('Actual Log(Mass)')
plt.ylabel('Predicted Log(Mass)')
# Gradient Boosting plot
plt.subplot(1, 2, 2)
plt.scatter(y_test, gb_pred, alpha=0.7, color='green')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', lw=2, color='red')
plt.title('Tuned Gradient Boosting: Actual vs Predicted Log(Mass)')
plt.xlabel('Actual Log(Mass)')
plt.ylabel('Predicted Log(Mass)')
plt.tight_layout()
plt.show()
Best Random Forest Model: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest MSE: 2.51, R²: 0.56

Best Gradient Boosting Model: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Gradient Boosting MSE: 2.60, R²: 0.54
¶
Best Random Forest Model:
Parameters: max_depth=20, min_samples_split=5, n_estimators=200. MSE: 2.51, R²: 0.56.
Best Gradient Boosting Model:
Parameters: learning_rate=0.1, max_depth=5, n_estimators=200. MSE: 2.60, R²: 0.54.
Visualization: both tuned models perform similarly, with Random Forest keeping a slight edge, and predictions follow the ideal red line reasonably closely. Note that this tuning run used only year, reclat, and reclong as features, so its scores trail the earlier untuned Random Forest that also saw the encoded fall and recclass dummies (R² 0.59).
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for regression analysis
# Remove rows with missing values in relevant columns
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong', 'fall', 'recclass']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Encode categorical variables
meteorite_data = pd.get_dummies(meteorite_data, columns=['fall', 'recclass'], drop_first=True)
# Log-transform mass to handle large values
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Step 4: Define features and target variable
X = meteorite_data.drop(columns=['mass (g)', 'log_mass'])
y = meteorite_data['log_mass']
# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 6: Train the best Random Forest model based on hyperparameter tuning
rf_best_model = RandomForestRegressor(max_depth=20, min_samples_split=5, n_estimators=200, random_state=42)
rf_best_model.fit(X_train, y_train)
# Step 7: Analyze feature importance
feature_importances = pd.Series(rf_best_model.feature_importances_, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
# Step 8: Plot feature importance
plt.figure(figsize=(12, 6))
feature_importances.head(20).plot(kind='bar', color='skyblue')
plt.title('Top 20 Feature Importances')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()
# Step 9: Display top features
print("\nTop Features by Importance:\n")
print(feature_importances.head(20))
Top Features by Importance:

reclat                   0.461816
year                     0.236572
reclong                  0.173123
recclass_L6              0.009002
fall_Found               0.008168
recclass_Iron, IIIAB     0.007450
recclass_H6              0.007267
recclass_H5              0.007217
recclass_L5              0.006762
recclass_Iron, IIAB      0.005435
recclass_E3              0.004398
recclass_H4              0.004230
recclass_LL5             0.003440
recclass_LL6             0.003292
recclass_Relict OC       0.003062
recclass_L4              0.002872
recclass_Iron, IAB-MG    0.002731
recclass_CM2             0.002502
recclass_CO3             0.002388
recclass_Iron            0.002030
dtype: float64
¶
Feature Importance Results
Top 3 Features: reclat (latitude) at 46.2% importance, year at 23.7%, and reclong (longitude) at 17.3%. These features dominate the model's predictive ability, indicating that meteorite mass correlates strongly with geographic and temporal factors.
Classification Features: the recclass categories contribute small amounts individually but may collectively provide additional predictive power; fall_Found (whether the meteorite was found or observed falling) also has minor influence.
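Impurity-based importances from tree ensembles can overstate high-cardinality numeric features, so a cross-check is worthwhile. A sketch using permutation importance, assuming rf_best_model, X_test, and y_test from the cell above are still in scope (this is slow with hundreds of dummy columns):

# Permutation importance as a sanity check on the impurity-based ranking above.
from sklearn.inspection import permutation_importance

perm = permutation_importance(rf_best_model, X_test, y_test, n_repeats=5, random_state=42)
perm_series = pd.Series(perm.importances_mean, index=X_test.columns).sort_values(ascending=False)
print(perm_series.head(10))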
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data
# Remove rows with missing values in relevant columns
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to reduce skewness
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Step 4: Visualize relationship between top features and mass
# 1. Latitude vs Log(Mass)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='reclat', y='log_mass', data=meteorite_data, alpha=0.6)
plt.title('Latitude vs Log(Mass)')
plt.xlabel('Latitude')
plt.ylabel('Log(Mass)')
plt.show()
# 2. Year vs Log(Mass)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='year', y='log_mass', data=meteorite_data, alpha=0.6, color='green')
plt.title('Year vs Log(Mass)')
plt.xlabel('Year')
plt.ylabel('Log(Mass)')
plt.show()
# 3. Longitude vs Log(Mass)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='reclong', y='log_mass', data=meteorite_data, alpha=0.6, color='orange')
plt.title('Longitude vs Log(Mass)')
plt.xlabel('Longitude')
plt.ylabel('Log(Mass)')
plt.show()
¶
Reflections on the Visualizations
Latitude vs. Log(Mass): large meteorites appear across a wide range of latitudes, and the data clusters around certain latitudes, potentially reflecting regions where landings or discoveries were more frequent.
Year vs. Log(Mass): meteorite entries concentrate in recent years, likely due to better documentation and detection technologies; outliers exist for older years, but they are sparse compared to the dense modern cluster.
Longitude vs. Log(Mass): as with latitude, certain longitudes (likely landmasses or regions with research stations) have denser records; the data is more spread out than in the year-based plot, suggesting no strong correlation between longitude and mass.
These observations hint at regional biases in data collection or landings, possible associations between geographic location and meteorite size, and a significant temporal shift in discovery documentation. A quick numeric check of the correlation claims follows below.
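The eyeball impressions above can be checked numerically; a minimal sketch, assuming the meteorite_data frame prepared in the cell above (with its log_mass column) is still in scope:

# Pairwise Pearson correlations between log-mass and the plotted features.
print(meteorite_data[['log_mass', 'year', 'reclat', 'reclong']].corr().round(3))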
¶
The next code applies both K-Means and DBSCAN clustering methods to the meteorite data:
K-Means: uses 4 clusters to group the data; results are visualized with different cluster colors.
DBSCAN: uses density-based clustering with eps=1.5 and min_samples=10, detecting clusters and outliers (noise points).
Comparison: cluster distributions for both methods are printed for comparison.
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for clustering
# Remove rows with missing values
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to reduce skewness
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Select features for clustering
X_clustering = meteorite_data[['log_mass', 'reclat', 'reclong']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)
# Step 4: Apply K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
meteorite_data['kmeans_cluster'] = kmeans.fit_predict(X_scaled)
# Step 5: Apply DBSCAN clustering
dbscan = DBSCAN(eps=1.5, min_samples=10)
meteorite_data['dbscan_cluster'] = dbscan.fit_predict(X_scaled)
# Step 6: Visualize K-Means clustering
plt.figure(figsize=(10, 6))
sns.scatterplot(x='reclong', y='reclat', hue='kmeans_cluster', data=meteorite_data, palette='viridis', alpha=0.7)
plt.title('K-Means Clustering of Meteorites')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Cluster')
plt.show()
# Step 7: Visualize DBSCAN clustering
plt.figure(figsize=(10, 6))
sns.scatterplot(x='reclong', y='reclat', hue='dbscan_cluster', data=meteorite_data, palette='deep', alpha=0.7)
plt.title('DBSCAN Clustering of Meteorites')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Cluster')
plt.show()
# Step 8: Compare cluster results
print("\nK-Means Cluster Distribution:\n")
print(meteorite_data['kmeans_cluster'].value_counts())
print("\nDBSCAN Cluster Distribution (including noise points):\n")
print(meteorite_data['dbscan_cluster'].value_counts())
K-Means Cluster Distribution:

kmeans_cluster
0    12358
3    10057
1     9054
2     6645
Name: count, dtype: int64

DBSCAN Cluster Distribution (including noise points):

dbscan_cluster
0    38114
Name: count, dtype: int64
¶
Reflections on Clustering Results
K-Means Clustering: the clusters are fairly evenly distributed geographically. Clear regions like Antarctica and Oceania form distinct clusters, possibly due to fewer meteorite entries or unique environmental factors affecting discoveries. This method divides data based on centroid proximity, which may not always reflect natural patterns.
DBSCAN Clustering: DBSCAN treats almost all points as part of a single cluster (label 0), indicating that most meteorites fall within one dense region. It flags no major outliers, likely because eps=1.5 on standardized features is a permissive neighborhood radius, so nearly all points join a single dense component (see the eps-selection sketch below).
World-like Structure: both methods highlight that meteorite data follows a land-based distribution, likely because most discoveries occur on populated land masses; sparse data points near the poles and oceans indicate geographic biases in data collection.
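To choose a tighter eps, a common heuristic is the k-distance plot: sort each point's distance to its k-th nearest neighbor (k = min_samples) and look for the elbow. A sketch, assuming X_scaled from the clustering cell above is still in scope:

# k-distance plot for choosing DBSCAN's eps (k = min_samples = 10).
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

nn = NearestNeighbors(n_neighbors=10).fit(X_scaled)
distances, _ = nn.kneighbors(X_scaled)
k_dist = np.sort(distances[:, -1])  # distance to the 10th nearest neighbor
plt.figure(figsize=(8, 4))
plt.plot(k_dist)
plt.title('k-distance Plot (k=10) for Choosing DBSCAN eps')
plt.xlabel('Points sorted by k-distance')
plt.ylabel('Distance to 10th nearest neighbor')
plt.show()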
Added code to explore regional insights for each cluster, focusing on:¶
Average Mass by Cluster: calculates and prints the average log-transformed mass for each K-Means cluster.
Top Meteorite Classes: displays a heatmap showing the prevalence of meteorite classes within each cluster.
Mass Distribution: plots a boxplot to visualize the spread of mass values in each cluster.
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for clustering
# Remove rows with missing values
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong', 'recclass']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to reduce skewness
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Select features for clustering
X_clustering = meteorite_data[['log_mass', 'reclat', 'reclong']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)
# Step 4: Apply K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
meteorite_data['kmeans_cluster'] = kmeans.fit_predict(X_scaled)
# Step 5: Explore regional insights by clusters
# 1. Average mass by cluster
cluster_mass_avg = meteorite_data.groupby('kmeans_cluster')['log_mass'].mean()
print("\nAverage Log(Mass) by K-Means Cluster:\n", cluster_mass_avg)
# 2. Top meteorite classes by cluster
cluster_classes = meteorite_data.groupby('kmeans_cluster')['recclass'].value_counts(normalize=True).unstack().fillna(0)
# Step 6: Plot top classes for each cluster
plt.figure(figsize=(12, 6))
sns.heatmap(cluster_classes, cmap='Blues', cbar=True)
plt.title('Top Meteorite Classes by K-Means Cluster')
plt.xlabel('Meteorite Class')
plt.ylabel('Cluster')
plt.show()
# Step 7: Visualize mass distribution by cluster
plt.figure(figsize=(10, 6))
sns.boxplot(x='kmeans_cluster', y='log_mass', hue='kmeans_cluster', data=meteorite_data, palette='muted', legend=False)  # hue assigned to avoid the seaborn palette deprecation warning
plt.title('Log(Mass) Distribution by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Log(Mass)')
plt.show()
Average Log(Mass) by K-Means Cluster:
kmeans_cluster
0    3.040139
1    3.397249
2    7.663111
3    2.710099
Name: log_mass, dtype: float64
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for clustering
# Remove rows with missing values
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong', 'recclass', 'name']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to reduce skewness
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Select features for clustering
X_clustering = meteorite_data[['log_mass', 'reclat', 'reclong']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)
# Step 4: Apply K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
meteorite_data['kmeans_cluster'] = kmeans.fit_predict(X_scaled)
# Step 5: Investigate cluster details
# 1. Identify the regions or names associated with larger meteorites in Cluster 2
cluster_2_data = meteorite_data[meteorite_data['kmeans_cluster'] == 2]
top_meteorites_cluster_2 = cluster_2_data.sort_values(by='log_mass', ascending=False).head(10)
print("\nTop 10 Largest Meteorites in Cluster 2:\n")
print(top_meteorites_cluster_2[['name', 'mass (g)', 'reclat', 'reclong', 'recclass']])
# 2. Visualize the geographic spread of meteorites in Cluster 2
plt.figure(figsize=(10, 6))
sns.scatterplot(x='reclong', y='reclat', size='log_mass', hue='recclass', data=cluster_2_data, alpha=0.7, legend=False, sizes=(40, 400))
plt.title('Geographic Spread of Large Meteorites in Cluster 2')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
Top 10 Largest Meteorites in Cluster 2:

                  name    mass (g)    reclat     reclong         recclass
16392             Hoba  60000000.0 -19.58333    17.91667        Iron, IVB
5373         Cape York  58200000.0  76.13333   -64.93333      Iron, IIIAB
5365   Campo del Cielo  50000000.0 -27.46667   -60.58333     Iron, IAB-MG
5370     Canyon Diablo  30000000.0  35.05000  -111.03333     Iron, IAB-MG
3455           Armanty  28000000.0  47.00000    88.00000       Iron, IIIE
12613           Gibeon  26000000.0 -25.50000    18.00000        Iron, IVA
5468        Chupaderos  24300000.0  27.00000  -105.10000      Iron, IIIAB
26297      Mundrabilla  24000000.0 -30.78333   127.55000    Iron, IAB-ung
920       Sikhote-Alin  23000000.0  46.16000   134.65333       Iron, IIAB
5016        Bacubirito  22000000.0  26.20000  -107.83333  Iron, ungrouped
¶
Top 10 Meteorites: Hoba is the largest meteorite in Cluster 2 with a mass of 60,000,000 grams. Other significant meteorites like Cape York and Campo del Cielo fall into the iron-based categories (Iron, IIIAB; Iron, IAB-MG), indicating a trend toward large iron meteorites in this cluster.
Geographic Spread: large meteorites in this cluster are distributed globally but appear to concentrate in areas like North America, Europe, and Australia, suggesting that some may have been found in easily accessible or research-prioritized regions.
Classification: the plot highlights that iron-type classes dominate within this cluster, aligning with the top iron-based meteorites; a quick class tally follows below.
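A frequency count inside the cluster makes the class claim concrete; a one-liner, assuming cluster_2_data from the cell above is still in scope:

# Tally meteorite classes within Cluster 2 to confirm the iron-type dominance.
print(cluster_2_data['recclass'].value_counts().head(10))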
¶
Added code to analyze Cluster 1. This includes:
Top 10 Meteorites in the cluster based on log-transformed mass.
Geographic Visualization showing the spread of meteorites.
Summary Statistics providing key metrics for the cluster's features.
# Step 1: Import necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
# Step 2: Load the dataset
meteorite_data = pd.read_csv('Meteorite_Landings.csv')
# Step 3: Clean and prepare data for clustering
# Remove rows with missing values
meteorite_data = meteorite_data[['mass (g)', 'year', 'reclat', 'reclong', 'recclass', 'name']].dropna()
meteorite_data = meteorite_data[meteorite_data['year'] <= 2025]
# Log-transform mass to reduce skewness
meteorite_data['log_mass'] = np.log1p(meteorite_data['mass (g)'])
# Select features for clustering
X_clustering = meteorite_data[['log_mass', 'reclat', 'reclong']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)
# Step 4: Apply K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
meteorite_data['kmeans_cluster'] = kmeans.fit_predict(X_scaled)
# Step 5: Investigate another cluster (e.g., Cluster 1)
cluster_1_data = meteorite_data[meteorite_data['kmeans_cluster'] == 1]
top_meteorites_cluster_1 = cluster_1_data.sort_values(by='log_mass', ascending=False).head(10)
print("\nTop 10 Largest Meteorites in Cluster 1:\n")
print(top_meteorites_cluster_1[['name', 'mass (g)', 'reclat', 'reclong', 'recclass']])
# Step 6: Visualize the geographic spread of meteorites in Cluster 1
plt.figure(figsize=(10, 6))
sns.scatterplot(x='reclong', y='reclat', size='log_mass', hue='recclass', data=cluster_1_data, alpha=0.7, legend=False, sizes=(40, 400))
plt.title('Geographic Spread of Meteorites in Cluster 1')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
# Step 7: Display summary statistics for Cluster 1
print("\nSummary Statistics for Cluster 1:\n")
print(cluster_1_data[['log_mass', 'reclat', 'reclong']].describe())
Top 10 Largest Meteorites in Cluster 1:

                  name  mass (g)    reclat     reclong   recclass
9274     El Médano 045     319.0 -24.85000  -70.533330         H5
11751          Fortuna     312.0 -35.13333  -65.366670  Winonaite
5421      Catalina 024     312.0 -25.23333  -69.716670         H4
21705  Los Vientos 018     308.0 -24.68333  -69.766670         L6
36771     San Juan 026     307.0 -25.44250  -69.871167         L6
21701  Los Vientos 011     300.0 -24.68333  -69.766670         L6
845              Renca     300.0 -32.75000  -65.283330         L5
9377     El Médano 148     298.0 -24.85000  -70.533330         H5
9318     El Médano 089     296.0 -24.85000  -70.533330         L6
9321     El Médano 092     289.0 -24.85000  -70.533330         H6

Summary Statistics for Cluster 1:

          log_mass       reclat      reclong
count  9054.000000  9054.000000  9054.000000
mean      3.397249     8.752148     4.031202
std       1.370155    14.494240    38.659086
min       0.000000   -35.133330  -137.700000
25%       2.302585     0.000000     0.000000
50%       3.526361     0.000000     0.000000
75%       4.583563    19.978877    16.139122
max       5.768321    66.138890   174.500430
¶
Geographic Spread: the spread is fairly dense across multiple continents but lacks representation near polar regions; meteorites in this cluster are geographically diverse but centered on lower latitudes.
Summary Statistics: the mean log(mass) is 3.40 (roughly 30 grams), and the maximum log mass of 5.77 indicates much smaller meteorites than in Cluster 2. Latitudes range from -35° to 66°, a concentration around temperate and equatorial regions; longitudes are more evenly spread, ranging from -137° to 174°.
Comparison to Cluster 2: Cluster 1 contains smaller meteorites on average, and its geographic distribution differs, with fewer meteorites in the Southern Hemisphere than Cluster 2; a cluster-by-cluster summary follows below.
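The pairwise comparison generalizes to one table across all four clusters; a short sketch, assuming meteorite_data with its kmeans_cluster labels from the cell above is still in scope:

# Side-by-side summary of mass and location for every K-Means cluster.
summary = meteorite_data.groupby('kmeans_cluster')[['log_mass', 'reclat', 'reclong']].agg(['mean', 'min', 'max'])
print(summary.round(2))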
¶
Overview of Analysis Steps:
Data Preparation: cleaned the meteorite dataset and transformed mass values using a log scale to reduce skewness.
Regression Analysis: explored linear, Random Forest, and Gradient Boosting models to predict meteorite mass; geographic (latitude/longitude) and temporal (year) features influenced predictions the most.
Feature Importance Analysis: latitude and year emerged as top contributors to predicting meteorite mass; recclass categories and fall types had smaller but noticeable impacts.
Clustering Analysis: applied both K-Means and DBSCAN. K-Means revealed four clusters with varying average mass and geographic patterns; DBSCAN grouped most points into a single high-density cluster.
Cluster-Specific Insights: Cluster 2 contained the largest meteorites, primarily iron-based classifications; Cluster 1 had smaller meteorites on average and centered on temperate regions.
Visualizations: scatter plots, heatmaps, and boxplots explored geographic, class, and mass distributions across clusters.

Reflection and Key Takeaways:
Regional Biases: meteorite discoveries are concentrated on landmasses, likely due to easier accessibility and research focus; polar and oceanic regions are underrepresented in the data.
Cluster Characteristics: larger, iron-based meteorites often form distinct clusters, while smaller meteorites show greater geographic and class diversity.
Model and Feature Limitations: the models performed moderately well, with R² values around 0.6; additional features, such as environmental factors or discovery methods, could improve predictions.
Future Directions: tuning clustering parameters further (a starting sketch follows below), exploring meteorite discovery trends over time, and expanding the dataset with external features (e.g., terrain or meteor shower data).
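As a concrete starting point for the clustering follow-up, one common approach is to sweep k for K-Means and score each fit with the silhouette coefficient. A sketch, assuming X_scaled from the clustering cells above is in scope; silhouette is slow on ~38k points, so it subsamples:

# Sweep k for K-Means and score with silhouette (on a subsample for speed).
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X_scaled), size=5000, replace=False)
X_sample = X_scaled[sample_idx]
for k in range(2, 8):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X_sample)
    print(f"k={k}: silhouette = {silhouette_score(X_sample, labels):.3f}")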