FeatureBaseDB · kordless · Jun 16, 2023 · Jun 16, 2023 · Jun 16, 2023 · Jun 16, 2023
diff --git a/data/spotify/insert_artists.sql b/data/spotify/insert_artists.sql
@@ -0,0 +1,27 @@
+-- Create table artists
+CREATE TABLE artists (
+    _id STRING,
+    artist_popularity int,
+    artist_genres STRINGSET,
+    artist_followers int
+);
+
+-- Url of the dataset: https://transfer.sh/qQDFEMJ3ij/artists.csv
+-- Insert data into artists with a bulk insert by ingesting a csv file from an url
+
+BULK INSERT
+INTO artists (_id, artist_popularity, artist_genres, artist_followers)
+MAP(0 STRING,
+1 int,
+2 STRINGSET,
+3 int)
+FROM
+    'https://transfer.sh/qQDFEMJ3ij/artists.csv'
+WITH
+    BATCHSIZE 100000
+    FORMAT 'CSV'
+    INPUT 'URL'
+    HEADER_ROW;
+
+-- Make sure the data is there by selecting the first row
+SELECT * FROM artists LIMIT 1;
diff --git a/data/spotify/insert_playlists.sql b/data/spotify/insert_playlists.sql
@@ -0,0 +1,35 @@
+-- Create table playlists
+CREATE TABLE playlists (
+    _id id,
+    uri STRING,
+    name STRING,
+    description STRING,
+    spotify_query STRING,
+    author STRING,
+    n_tracks int,
+    playlist_followers int
+);
+
+-- Url of the dataset: https://transfer.sh/csCuCE2FTO/final_playlists.csv
+-- Insert data into playlists with a bulk insert by ingesting a csv file from an url
+
+BULK INSERT
+INTO playlists (_id, uri, name, description, spotify_query, author, n_tracks, playlist_followers)
+MAP(0 id,
+1 STRING,
+2 STRING,
+3 STRING,
+4 STRING,
+5 STRING,
+6 int,
+7 int)
+FROM
+    'https://transfer.sh/csCuCE2FTO/final_playlists.csv'
+WITH
+    BATCHSIZE 100000
+    FORMAT 'CSV'
+    INPUT 'URL'
+    HEADER_ROW;
+
+-- Make sure the data is there by selecting the first row
+SELECT * FROM playlists LIMIT 1;
diff --git a/data/spotify/insert_tracks.sql b/data/spotify/insert_tracks.sql
@@ -0,0 +1,40 @@
+-- Create table tracks
+CREATE TABLE tracks (
+    _id id,
+    name STRING,
+    artists_names STRINGSET,
+    track_uri STRING,
+    popularity int,
+    album_type STRING,
+    is_playable bool,
+    release_date STRING,
+    artists_uris STRINGSET,
+    playlist_uris STRINGSET
+);
+
+
+-- Url of the dataset: https://transfer.sh/uEVnKIdpnx/final_tracks.csv
+-- Insert data into tracks with a bulk insert by ingesting a csv file from an url
+
+BULK INSERT
+INTO tracks (_id, name, artists_names, track_uri, popularity, album_type, is_playable, release_date, artists_uris, playlist_uris)
+MAP(0 id,
+1 STRING,
+2 STRINGSET,
+3 STRING,
+4 int,
+5 STRING,
+6 bool,
+7 STRING,
+8 STRINGSET,
+9 STRINGSET)
+FROM
+    'https://transfer.sh/uEVnKIdpnx/final_tracks.csv'
+WITH
+    BATCHSIZE 100000
+    FORMAT 'CSV'
+    INPUT 'URL'
+    HEADER_ROW;
+
+-- Make sure the data is there by selecting the first row
+SELECT * FROM tracks LIMIT 1;
diff --git a/data/spotify/queries.sql b/data/spotify/queries.sql
@@ -0,0 +1,174 @@
+-- This file contains some SQL queries that you can use to explore the data with fbsql.
+
+-- Find the most popular artists:
+SELECT * FROM artists
+ORDER BY artist_popularity DESC
+LIMIT 10;
+
+-- Find the most popular tracks:
+SELECT * FROM tracks
+ORDER BY popularity DESC
+LIMIT 10;
+
+-- Find the most popular playlists:
+SELECT * FROM playlists
+ORDER BY playlist_followers DESC
+LIMIT 10;
+
+-- Find the most popular playlists with more than 100 tracks:
+SELECT * FROM playlists
+WHERE n_tracks > 100
+ORDER BY playlist_followers DESC
+LIMIT 10;
+
+-- Find playlists with more than 100 tracks and sort them by the number of followers in descending order:
+SELECT * FROM playlists
+WHERE n_tracks > 100
+ORDER BY playlist_followers DESC;
+
+-- Find the albums released in a specific year along with the total number of tracks in each album:
+SELECT release_date, album_type, COUNT(*) AS track_count
+FROM tracks
+WHERE SUBSTRING(release_date, 1, 4) = '2022' -- Replace '2022' with the desired year
+GROUP BY release_date, album_type;
+
+-- Find comprehensive details of playlists, tracks, and associated artists by performing a join operation on the respective tables.
+SELECT
+    p.name AS playlist_name,
+    p.description AS playlist_description,
+    p.n_tracks AS playlist_tracks,
+    p.playlist_followers AS playlist_followers,
+    t.name AS track_name,
+    t.popularity AS track_popularity,
+    t.release_date AS track_release_date,
+    a.artist_popularity AS artist_popularity,
+    a.artist_followers AS artist_followers
+FROM
+    playlists p
+    JOIN tracks t ON SETCONTAINS(t.playlist_uris, p.uri)
+    JOIN artists a ON SETCONTAINS(t.artists_uris, a._id);
+
+-- Find the average track and artist popularity, as well as the count of distinct tracks and artists, for each playlist.
+SELECT
+    p.name AS playlist_name,
+    AVG(t.popularity) AS average_track_popularity,
+    AVG(a.artist_popularity) AS average_artist_popularity,
+    COUNT(DISTINCT t.name) AS distinct_track_count,
+    COUNT(DISTINCT a._id) AS distinct_artist_count
+FROM
+    playlists p
+    JOIN tracks t ON SETCONTAINS(t.playlist_uris, p.uri)
+    JOIN artists a ON SETCONTAINS(t.artists_uris, a._id)
+GROUP BY
+    p.name;
+
+-- Find the most popular artists by the average popularity of their tracks across all playlists.
+SELECT
+    a._id AS artist_id,
+    a.artist_popularity AS artist_popularity,
+    COUNT(DISTINCT p._id) AS playlist_count,
+    AVG(t.popularity) AS average_track_popularity
+FROM
+    artists a
+    JOIN tracks t ON SETCONTAINS(t.artists_uris, a._id)
+    JOIN playlists p ON SETCONTAINS(t.playlist_uris, p.uri)
+GROUP BY
+    a._id, a.artist_popularity
+HAVING
+    COUNT(DISTINCT p._id) > 5
+ORDER BY
+    average_track_popularity DESC
+LIMIT 10;
+
+-- Find the most popular artists by the total number of followers across all playlists.
+SELECT
+    p.name AS playlist_name,
+    t.name AS track_name,
+    a._id AS artist_id,
+    COUNT(DISTINCT t._id) AS track_count,
+    SUM(a.artist_followers) AS total_artist_followers
+FROM
+    playlists p
+    JOIN tracks t ON SETCONTAINS(t.playlist_uris, p.uri)
+    JOIN artists a ON SETCONTAINS(t.artists_uris, a._id)
+WHERE
+    p.playlist_followers > 10000
+GROUP BY
+    p.name, t.name, a._id
+HAVING
+    COUNT(DISTINCT t._id) > 5
+ORDER BY
+    total_artist_followers DESC;
+
+-- Find the most popular playlists by the average popularity of their tracks.
+SELECT
+    p.name AS playlist_name,
+    COUNT(DISTINCT t._id) AS track_count,
+    AVG(t.popularity) AS average_track_popularity,
+    MAX(a.artist_followers) AS max_artist_followers
+FROM
+    playlists p
+    JOIN tracks t ON SETCONTAINS(t.playlist_uris, p.uri)
+    JOIN artists a ON SETCONTAINS(t.artists_uris, a._id)
+WHERE
+    p.n_tracks > 50
+GROUP BY
+    p.name
+HAVING
+    COUNT(DISTINCT t._id) > 10
+ORDER BY
+    max_artist_followers DESC
+LIMIT 5;
+
+-- Find the most popular playlists by the average popularity of their tracks, but only for playlists that contain tracks from a specific artist.
+SELECT
+    p.name AS playlist_name,
+    COUNT(DISTINCT t._id) AS track_count,
+    AVG(t.popularity) AS average_track_popularity,
+    MAX(a.artist_followers) AS max_artist_followers
+FROM
+    playlists p
+    JOIN tracks t ON SETCONTAINS(t.playlist_uris, p.uri)
+    JOIN artists a ON SETCONTAINSANY(t.artists_uris, ['spotify:artist:0001wHqxbF2YYRQxGdbyER'])
+WHERE
+    p.playlist_followers > 10000
+GROUP BY
+    p.name
+HAVING
+    COUNT(DISTINCT t._id) > 10
+ORDER BY
+    max_artist_followers DESC;
+
+-- Find the substrings of the first two characters of the name of each track.
+SELECT
+    _id,
+    SUBSTRING(name, 1, 2) AS substr
+FROM
+    tracks;
+
+-- Find the artists who have collaborated with the most number of other artists:
+SELECT a1._id AS artist_id, COUNT(DISTINCT a2._id) AS collaborator_count
+FROM artists a1
+JOIN tracks t1 ON SETCONTAINS(t1.artists_uris, a1._id)
+JOIN artists a2 ON SETCONTAINS(t1.artists_uris, a2._id) AND a1._id != a2._id
+GROUP BY a1._id
+ORDER BY collaborator_count DESC
+LIMIT 10;
+
+-- Find the artists with the highest average track popularity:
+SELECT a._id AS artist_id, AVG(t.popularity) AS average_track_popularity
+FROM artists a
+JOIN tracks t ON SETCONTAINS(t.artists_uris, a._id)
+GROUP BY a._id
+ORDER BY average_track_popularity DESC
+LIMIT 10;
+
+-- Find the artists with the most collaborative tracks:
+SELECT a._id AS artist_id, COUNT(DISTINCT t._id) AS collaborative_track_count
+FROM artists a
+JOIN tracks t ON SETCONTAINS(t.artists_uris, a._id)
+GROUP BY a._id
+ORDER BY collaborative_track_count DESC
+LIMIT 10;
+
+
diff --git a/data/spotify/readme.md b/data/spotify/readme.md
@@ -0,0 +1,52 @@
+# FbSql Spotify Setup
+
+## Dataset
+
+The data for this project has been taken from Kaggle and is available at this URL: [278k Spotify Songs](https://www.kaggle.com/datasets/viktoriiashkurenko/278k-spotify-songs).
+
+## Serverless Database Setup
+
+1. Sign in to [Featurebase Cloud](https://www.featurebase.com) using your credentials.
+
+2. Create a serverless database named "spotify" in Featurebase Cloud.
+
+## Installation and Configuration
+
+1. Install FbSql locally on your machine.
+
+2. Open your terminal or command prompt and run the following command to connect to Featurebase Cloud using FbSql:
+
+   ```shell
+   fbsql --host="https://query.featurebase.com" --email="acappelletti@molecula.com" --password="<yourPassword>"
+   ```
+
+3. Once connected, use the following command to connect to the "spotify" database:
+   ```shell
+   \c spotify
+   ```
+4. After connecting to the "spotify" database, increase the number of workers to six for better performance:
+   ```shell
+   alter database spotify with units 6;
+   ```
+
+## Data Import
+
+1. Download the dataset from Kaggle: [278k Spotify Songs](https://www.kaggle.com/datasets/viktoriiashkurenko/278k-spotify-songs).
+
+2. Upload the dataset to an online file-sharing service like [transfer.sh](https://transfer.sh/).
+
+3. Use the provided SQL files (`insert_playlists.sql`, `insert_artists.sql`, `insert_tracks.sql`) to create and populate the respective tables:
+
+   ```shell
+   \i insert_playlists.sql
+   \i insert_artists.sql
+   \i insert_tracks.sql
+   ```
+
+## Running Queries
+
+To run the queries, execute the `queries.sql` file:
+
+```shell
+\i queries.sql
+```