Common data cleaning and transforming tasks in SQL
https://www.datacamp.com/tutorial/cleaning-data-sql#!
https://towardsdatascience.com/cleaning-and-transforming-data-with-sql-f93c4de0d2fc
filtering out NULL
-- There are null values in column b
-- We want to remove those rows as they are invalid data
-- Query produces an output with NULL values from column be removed.
SELECT
column_a,
column_b,
column_c,
FROM table_1
WHERE column_b IS NOT NULL
Changing column data types
-- Sometimes columns are not in an appropriate type.
-- For example, a numeric column might be string type within the database
--
SELECT
column_a,
CAST(column_b AS NUMERIC) AS column_b,
column_c
FROM table_1
Changing column names
-- Sometimes columns are not named appropriately
-- Using `AS` when selecting columns can be used to rename
-- columns in the output results
SELECT
column_a AS the_first_column
column_b AS a_better_name_for_the_second_column,
column_c AS clear_and_understandable_name
FROM table_1
Replacing strings
Identifying duplicate rows
SELECT COUNT(*)
FROM
(SELECT
column_a,
column_b,
column_c,
COUNT(*) AS records
FROM table_1
GROUP BY
column_a,
column_b,
column_c) a
WHERE records > 1;
-- rewritten as CTE
WITH duplicate_count AS (
SELECT
column_a,
column_b,
column_c,
COUNT(*) AS records
FROM table_1
GROUP BY
column_a,
column_b,
column_c
)
-- count rows where duplicates have been found
SELECT COUNT(*)
FROM duplicate_count
WHERE records > 1;