import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import Image
import http
http.client.HTTPConnection.debuglevel = 1
import json
import requests
Overview of the Analysis and the Dataset¶
Stellar and celestial objects have always been a source of intrigue to human beings. Asteroids and meteors often enter the earth's atmosphere, and some meteorites end up landing on earth. When celestial objects like meteorites hit the earth, they are examined by space agencies and scientists to determine their chemical composition, properties and even the presence of extraterrestrial samples! The earliest recorded meteorite landing was the Aegospotami meteorite that fell to the earth around 467 BC. The most recent landing was recorded in Texas as of February 2023. Read more about it here.
In this article, we analyze the 'Meteorite Landings' public dataset published by NASA and recorded by the Meteoritical Society. The dataset can be downloaded as a '.csv' file from the 'Downloads and Resources' section of the website - https://catalog.data.gov/dataset/meteorite-landings. The file that gets downloaded has the name 'Meteorite_Landings.csv'. Below is a snapshot of the dataset -
Data Preparation¶
# Read the NASA meteorite-landings CSV into a DataFrame and normalize the
# column names (GridDB disallows characters such as '(' and '/' in column
# and container names, so 'mass (g)' must be renamed before loading).
Meteorite_Landings = pd.read_csv('Meteorite_Landings.csv')
Meteorite_Landings.columns
# BUG FIX: the original call omitted the `columns=` keyword, so the mapping
# was applied to the index labels and the 'mass (g)' column was never renamed.
Meteorite_Landings.rename(columns={'mass (g)': 'mass(g)'}, inplace=True)
About GridDB¶
We use GridDB to store the data and to query and visualize the results using SQL queries. GridDB supports 2 types of containers to store data namely a Collection and a Time Series Container. A 'Time Series' container is highly effective for data that has timestamps. Timeseries containers are used in applications that require a time-series analysis whereas a 'Collection' container can be used for data where the objective is to slice and dice data across dimensions, and to calculate metrics and associated visualizations. Read more about container types in this official GridDB resource page.
GridDB supports SQL like querying and all standard SQL functions. To know more about the SQL commands and functions in GridDB, refer to this SQL reference.
Setting up Authorization and Request¶
While connecting to GridDB cloud, the Request that is constructed should have the username and password encoded to base64. To determine the base64 encoded credentials for your username and password, a tool such as https://www.base64encode.org/ can be used. Refer to the resource listed here to learn more about the different entities involved in creating a request - https://griddb.net/en/blog/using-griddb-cloud-version-1-2/. The basic authentication format in GridDB is 'username:password' where the case-sensitive username and password should be used for encoding.
# Request headers for the GridDB WebAPI. Replace the XXX placeholder with the
# base64-encoded 'username:password' credentials of your GridDB instance.
header_obj = {
    "Authorization": "Basic XXX",
    "Content-Type": "application/json; charset=UTF-8",
    "User-Agent": "PostmanRuntime/7.29.0",
}
# Base URL of the GridDB cluster to connect to; substitute each bracketed
# placeholder with the value that corresponds to your GridDB instance:
# 'https://[host]:[port]/griddb/v2/[clustername]/dbs/[database_name]/'
base_url = 'https://[host]:[port]/griddb/v2/[clustername]/dbs/[database_name]/'
Creating the Container¶
Note that while creating column names, special characters such as '(', '/' should be avoided in column names and container names.
# Schema of the 'Meteorite_Landings' collection container, expressed as
# (column name, GridDB type) pairs. Note that the names deliberately avoid
# special characters such as '(' and '/' which GridDB rejects.
_column_specs = [
    ("name", "STRING"),
    ("id", "INTEGER"),
    ("nametype", "STRING"),
    ("recclass", "STRING"),
    ("mass", "FLOAT"),
    ("fall", "STRING"),
    ("year", "STRING"),
    ("reclat", "FLOAT"),
    ("reclong", "FLOAT"),
    ("GeoLocation", "STRING"),
]
# Request body for container creation: a COLLECTION container (no row key)
# whose columns are expanded from the spec table above.
data_obj = {
    "container_name": "Meteorite_Landings",
    "container_type": "COLLECTION",
    "rowkey": False,
    "columns": [{"name": col, "type": typ} for col, typ in _column_specs],
}
# Container-creation endpoint of the GridDB WebAPI.
url = f"{base_url}containers"
# POST the container definition with the auth headers; the response object
# is kept in `x` so its status can be inspected afterwards.
x = requests.post(url, json=data_obj, headers=header_obj)
Loading data into the container¶
The process of loading rows into a container is known as 'Row Registration' in GridDB terminology. Here is a resource to learn more about the GridDB WebAPI and the process of 'Row Registration' - https://www.toshiba-sol.co.jp/en/pro/griddb/docs-en/v4_6/GridDB_Web_API_Reference.html#row-registration.
# Serialize the DataFrame rows as a JSON array-of-arrays, which is the
# payload shape the GridDB row-registration endpoint expects.
Meteorite_Landings_json = Meteorite_Landings.to_json(orient='values')
request_body_Meteorite_Landings = Meteorite_Landings_json
# Row-registration endpoint of the container created previously.
url = f"{base_url}containers/Meteorite_Landings/rows"
# PUT the serialized rows into the container via the GridDB WebAPI.
x = requests.put(url, data=request_body_Meteorite_Landings, headers=header_obj)
IOPub data rate exceeded. The notebook server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--NotebookApp.iopub_data_rate_limit`. Current values: NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec) NotebookApp.rate_limit_window=3.0 (secs)
reply: 'HTTP/1.1 200 \r\n' header: Date: Fri, 24 Mar 2023 17:13:06 GMT header: Content-Type: application/json;charset=UTF-8 header: Transfer-Encoding: chunked header: Connection: keep-alive header: Server: Apache/2.4.54 (IUS)
A short note on Meteorite Classifications¶
Before we dive into the analysis, here is a short note on recognized classifications of meteorites.
There are subtypes for each of these types. For example, Chondrites are divided into various subtypes such as H, L, etc. based on various physical and chemical characteristics. Read this resource to understand the classification of Chondrites - https://en.wikipedia.org/wiki/Chondrite#Chondrite_classification. Subgroups of Pallasites include PM5, PES, PPX, etc. To know more about pallasites, refer to this resource - https://en.wikipedia.org/wiki/Pallasite. Here is one on Mesosiderites - https://web.archive.org/web/20100510021536/http://www4.nau.edu/meteorite/Meteorite/Mesosiderites.html. Here is a reference to Achondrites - https://en.wikipedia.org/wiki/Achondrite. Let's delve into the dataset and the analysis to learn more.
Data Visualization & Analysis¶
What are the meteorites that fall frequently?¶
GridDB supports most SQL aggregation and Window functions. To know more, refer to this resource.
# SQL: count meteorites per recognized class, keeping only classes with more
# than 500 recorded landings, sorted by frequency (descending).
sql_query1 = (
    "SELECT recclass,count(*) FROM Meteorite_Landings "
    "GROUP BY 1 HAVING count(*) >500 ORDER BY 2 desc"
)
# SQL endpoint of the GridDB WebAPI.
url = base_url + 'sql'
# Wrap the statement in the sql-select request envelope the WebAPI expects.
request_body = '[{"type":"sql-select", "stmt":"' + sql_query1 + '"}]'
# Echo the constructed request body as a quick sanity check.
request_body
'[{"type":"sql-select", "stmt":"SELECT recclass,count(*) FROM Meteorite_Landings GROUP BY 1 HAVING count(*) >500 ORDER BY 2 desc"}]'
# Execute the SQL statement via the GridDB WebAPI.
data_req1 = requests.post(url, data=request_body, headers=header_obj)
# Unpack the first result set of the JSON response into a two-column pandas
# DataFrame, taking the column labels from the response metadata.
myJson = data_req1.json()
first_result = myJson[0]
result_columns = [first_result["columns"][0]["name"], first_result["columns"][1]["name"]]
Meteorite_count = pd.DataFrame(first_result["results"], columns=result_columns)
# Give the DataFrame presentation-friendly column names.
# NOTE(review): the '' key assumes GridDB returns the count(*) aggregate with
# an empty column name — confirm against the actual response payload.
Meteorite_count.rename(
    columns={'recclass': 'Recognized Classification', '': 'Number of Meteorites'},
    inplace=True,
)
# Bar chart: number of meteorites per recognized classification, with the
# count rendered inside each bar.
fig = px.bar(
    Meteorite_count,
    y="Number of Meteorites",
    x="Recognized Classification",
    text='Number of Meteorites',
    title='Meteorites by Recognized Classification',
)  # orientation='h'
fig.update_traces(textposition='inside')
# BUG FIX: 'categoryorder' only affects a categorical axis. The categories
# ('Recognized Classification') are on the x-axis here while y is numeric, so
# setting it on the y-axis was a no-op; order the bars on the x-axis instead.
fig.update_layout(xaxis={'categoryorder': 'total ascending'})
fig.show()