Changes

Jump to navigation Jump to search
no edit summary
{{Project
|Has project output=Data
|Has sponsor=McNair Center
|Has title=Patent Data Cleanup - June 2016
|Has owner=Marcela Interiano,
|Has project status=Subsume
|Has keywords=Data
}}
== About this Page ==
This page contains the script that was used to clean up the patents and assignees tables in allpatent.
 
Cleaning up includes:
* Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap.
** The 'NULL's got replaced with NULL
** The -1s got replaced with NULL as well.
 
* Merging some more columns, and dropping unnecessary columns:
** At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been.
** The script that follows merges those columns as well.
NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors
 
* Renaming tables and columns
** Table names and column names have been standardized.
** General rule of thumb is : short column names, singular table names (for example : patent and not patents)
== Script ==
ALTER TABLE patents
RENAME COLUMN patentnumber TO patent;
 
ALTER TABLE patents
DROP COLUMN kind,
DROP COLUMN title,
DROP COLUMN ussubclass, **
DROP COLUMN maingroup, --
DROP COLUMN subgroup,--
DROP COLUMN cpcsubclass, ++
DROP COLUMN cpcmaingroup, ++
DROP COLUMN classificationnationalcountry,
DROP COLUMN classificationnationalclass,** (?)
DROP COLUMN primaryexaminerfirstname,
DROP COLUMN primaryexaminerlastname,
DROP COLUMN primaryexaminerdepartment,
DROP COLUMN filename;
 
UPDATE patents
SET type = '2015'
WHERE type != 'NULL';
 
-- RESULT : UPDATE 1646225
 
UPDATE patents
SET type = '2010'
WHERE type = 'NULL';
-- RESULT : UPDATE 3764926
 
/* Join the historical patent data from the US PTO with the patents table */
 
ALTER TABLE PATENTS
ADD COLUMN nber INT,
ADD COLUMN uspc varchar,
ADD COLUMN uspc_sub varchar;
 
UPDATE patents p
SET nber = hp.nber,
uspc = hp.uspc,
uspc_sub = hp.uspc
FROM historicalpatentdata hp
WHERE hp.patentnumber = CAST(p.patent AS varchar);
 
-- RESULT : UPDATE 5113655
 
/* Mergeing some columns - claims and number of claims - column name : claims*/
UPDATE patents
SET claims = numberofclaims
WHERE claims = -1;
-- RESULTS : UPDATE 1646225
 
/* Merging columns -
UPDATE patents
SET appnum = CAST (applicationnumber AS INT)
where appnum = -1;
-- RESULT : UPDATE 1646225
 
UPDATE patents
SET appdate = filingdate
where appdate = '0001-01-01 BC'
OR filingdate is not NULL;
 
-- RESULT UPDATE 1646225
 
ALTER TABLE patents
DROP COLUMN apptype;
 
/* Generating GYear and AppYear from the dates */
UPDATE patents
SET gyear = EXTRACT(year from grantdate)
WHERE gyear = -1
AND grantdate IS NOT NULL;
UPDATE 1646225
 
UPDATE patents
SET appyear = EXTRACT(year from appdate)
WHERE appyear = -1
AND appdate is not null;
-- RESULT UPDATE 1646225
 
/* Test Script */
SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
 
patent | integer | not null
grantdate | date |
type | character varying |
applicationnumber | character varying |
filingdate | date |
prioritydate | date |
prioritycountry | character varying |
prioritypatentnumber | character varying |
cpcsubgroup | character varying |
numberofclaims | integer |
pctpatentnumber | character varying |
claims | integer |
appnum | integer |
gyear | integer |
appdate | date |
appyear | integer |
nber | integer |
uspc | character varying |
uspc_sub | character varying
 
/* Drop the merged columns */
 
ALTER TABLE patents
DROP COLUMN numberofclaims,
DROP COLUMN filingdate,
DROP COLUMN applicationnumber,
DROP COLUMN type;
 
 
UPDATE patents
SET prioritycountry = NULL
WHERE prioritycountry = 'NULL';
 
UPDATE patents
SET pctpatentnumber = NULL
WHERE pctpatentnumber = 'NULL';
 
UPDATE patents
SET prioritypatentnumber = NULL
WHERE prioritypatentnumber = 'NULL';
 
UPDATE patents
SET cpcsubgroup = NULL
WHERE cpcsubgroup = 'NULL';
 
UPDATE patents
SET appnum = NULL
WHERE appnum = -1;
 
UPDATE patents
SET gyear = NULL
WHERE gyear = -1;
 
UPDATE patents
 
SET appyear = NULL
WHERE appyear = -1;
 
Results:
 
allpatent=# ALTER TABLE patents
allpatent-# DROP COLUMN numberofclaims,
allpatent-# DROP COLUMN filingdate,
allpatent-# DROP COLUMN applicationnumber,
allpatent-# DROP COLUMN type;
ALTER TABLE
allpatent=#
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET prioritycountry = NULL
allpatent-# WHERE prioritycountry = 'NULL';
^[[BUPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET pctpatentnumber = NULL
allpatent-# WHERE pctpatentnumber = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET prioritypatentnumber = NULL
allpatent-# WHERE prioritypatentnumber = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET cpcsubgroup = NULL
allpatent-# WHERE cpcsubgroup = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET appnum = NULL
allpatent-# WHERE appnum = -1;
UPDATE 0
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET gyear = NULL
allpatent-# WHERE gyear = -1;
UPDATE 0
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET appyear = NULL
allpatent-# WHERE appyear = -1;
UPDATE 0
allpatent=#
 
 
UPDATE assignees
SET lastname = NULL
WHERE lastname = 'null';
 
UPDATE assignees
SET firstname = NULL
WHERE firstname = 'null';
 
UPDATE assignees
SET address = NULL
WHERE address = 'null';
 
UPDATE assignees
SET postcode = NULL
WHERE postcode = 'null';
 
UPDATE assignees
SET patentcountry = NULL
WHERE patentcountry = 'null';
 
UPDATE assignees
SET nationality2 = NULL
WHERE nationality2 = 'null';
 
UPDATE assignees
SET residence = NULL
WHERE residence = 'null';
 
UPDATE assignees
SET asgseq = NULL
WHERE asgseq= -1;
 
UPDATE assignees
SET asgtype = NULL
WHERE asgtype = -1;
 
RESULTS:
 
UPDATE assignees
allpatent-# SET lastname = NULL
allpatent-# WHERE lastname = 'null';
 
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET firstname = NULL
allpatent-# WHERE firstname = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET address = NULL
allpatent-# WHERE address = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET postcode = NULL
allpatent-# WHERE postcode = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET patentcountry = NULL
allpatent-# WHERE patentcountry = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET nationality2 = NULL
allpatent-# WHERE nationality2 = 'null';
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET residence = NULL
allpatent-# WHERE residence = 'null';
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET asgseq = NULL
allpatent-# WHERE asgseq= -1;
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET asgtype = NULL
allpatent-# WHERE asgtype = -1;
UPDATE 1607714
allpatent=#
 
 
CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;
 
== Renaming Tables and Columns ==
 
To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.
* '''allpatent''' database -> '''patent'''
* assignees -> assignee
* judges -> judge
* citations -> citation
* matchassignees -> MatchOrgNames
* patents -> patent
* assignees -> ptoassignee
* assignments -> ptoassignment
* assignors -> ptoassignor
* patentassignment -> ptopatentfile
* properties -> ptoproperty
* mslfee -> feestatus
* patentmaintenancefee -> fee

Navigation menu