Difference between revisions of "Patent Data Cleanup (June 2016)"
Jump to navigation
Jump to search
(Created page with "== About this Page == This page contains the script that was used to clean up the patents and assignees tables in allpatent. == Script ==") |
|||
| (12 intermediate revisions by 4 users not shown) | |||
| Line 1: | Line 1: | ||
| + | {{Project | ||
| + | |Has project output=Data | ||
| + | |Has sponsor=McNair Center | ||
| + | |Has title=Patent Data Cleanup - June 2016 | ||
| + | |Has owner=Marcela Interiano, | ||
| + | |Has project status=Subsume | ||
| + | |Has keywords=Data | ||
| + | }} | ||
== About this Page == | == About this Page == | ||
This page contains the script that was used to clean up the patents and assignees tables in allpatent. | This page contains the script that was used to clean up the patents and assignees tables in allpatent. | ||
| + | |||
| + | Cleaning up includes: | ||
| + | * Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap. | ||
| + | ** The 'NULL's got replaced with NULL | ||
| + | ** The -1s got replaced with NULL as well. | ||
| + | |||
| + | * Merging some more columns, and dropping unnecessary columns: | ||
| + | ** At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been. | ||
| + | ** The script that follows merges those columns as well. | ||
| + | NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors | ||
| + | |||
| + | * Renaming tables and columns | ||
| + | ** Table names and column names have been standardized. | ||
| + | ** General rule of thumb is : short column names, singular table names (for example : patent and not patents) | ||
== Script == | == Script == | ||
| + | ALTER TABLE patents | ||
| + | RENAME COLUMN patentnumber TO patent; | ||
| + | |||
| + | ALTER TABLE patents | ||
| + | DROP COLUMN kind, | ||
| + | DROP COLUMN title, | ||
| + | DROP COLUMN ussubclass, ** | ||
| + | DROP COLUMN maingroup, -- | ||
| + | DROP COLUMN subgroup,-- | ||
| + | DROP COLUMN cpcsubclass, ++ | ||
| + | DROP COLUMN cpcmaingroup, ++ | ||
| + | DROP COLUMN classificationnationalcountry, | ||
| + | DROP COLUMN classificationnationalclass,** (?) | ||
| + | DROP COLUMN primaryexaminerfirstname, | ||
| + | DROP COLUMN primaryexaminerlastname, | ||
| + | DROP COLUMN primaryexaminerdepartment, | ||
| + | DROP COLUMN filename; | ||
| + | |||
| + | UPDATE patents | ||
| + | SET type = '2015' | ||
| + | WHERE type != 'NULL'; | ||
| + | |||
| + | -- RESULT : UPDATE 1646225 | ||
| + | |||
| + | UPDATE patents | ||
| + | SET type = '2010' | ||
| + | WHERE type = 'NULL'; | ||
| + | -- RESULT : UPDATE 3764926 | ||
| + | |||
| + | /* Join the historical patent data from the US PTO with the patents table */ | ||
| + | |||
| + | ALTER TABLE PATENTS | ||
| + | ADD COLUMN nber INT, | ||
| + | ADD COLUMN uspc varchar, | ||
| + | ADD COLUMN uspc_sub varchar; | ||
| + | |||
| + | UPDATE patents p | ||
| + | SET nber = hp.nber, | ||
| + | uspc = hp.uspc, | ||
| + | uspc_sub = hp.uspc | ||
| + | FROM historicalpatentdata hp | ||
| + | WHERE hp.patentnumber = CAST(p.patent AS varchar); | ||
| + | |||
| + | -- RESULT : UPDATE 5113655 | ||
| + | |||
| + | /* Mergeing some columns - claims and number of claims - column name : claims*/ | ||
| + | UPDATE patents | ||
| + | SET claims = numberofclaims | ||
| + | WHERE claims = -1; | ||
| + | -- RESULTS : UPDATE 1646225 | ||
| + | |||
| + | /* Merging columns - | ||
| + | UPDATE patents | ||
| + | SET appnum = CAST (applicationnumber AS INT) | ||
| + | where appnum = -1; | ||
| + | -- RESULT : UPDATE 1646225 | ||
| + | |||
| + | UPDATE patents | ||
| + | SET appdate = filingdate | ||
| + | where appdate = '0001-01-01 BC' | ||
| + | OR filingdate is not NULL; | ||
| + | |||
| + | -- RESULT UPDATE 1646225 | ||
| + | |||
| + | ALTER TABLE patents | ||
| + | DROP COLUMN apptype; | ||
| + | |||
| + | /* Generating GYear and AppYear from the dates */ | ||
| + | UPDATE patents | ||
| + | SET gyear = EXTRACT(year from grantdate) | ||
| + | WHERE gyear = -1 | ||
| + | AND grantdate IS NOT NULL; | ||
| + | UPDATE 1646225 | ||
| + | |||
| + | UPDATE patents | ||
| + | SET appyear = EXTRACT(year from appdate) | ||
| + | WHERE appyear = -1 | ||
| + | AND appdate is not null; | ||
| + | -- RESULT UPDATE 1646225 | ||
| + | |||
| + | /* Test Script */ | ||
| + | SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100; | ||
| + | |||
| + | patent | integer | not null | ||
| + | grantdate | date | | ||
| + | type | character varying | | ||
| + | applicationnumber | character varying | | ||
| + | filingdate | date | | ||
| + | prioritydate | date | | ||
| + | prioritycountry | character varying | | ||
| + | prioritypatentnumber | character varying | | ||
| + | cpcsubgroup | character varying | | ||
| + | numberofclaims | integer | | ||
| + | pctpatentnumber | character varying | | ||
| + | claims | integer | | ||
| + | appnum | integer | | ||
| + | gyear | integer | | ||
| + | appdate | date | | ||
| + | appyear | integer | | ||
| + | nber | integer | | ||
| + | uspc | character varying | | ||
| + | uspc_sub | character varying | ||
| + | |||
| + | /* Drop the merged columns */ | ||
| + | |||
| + | ALTER TABLE patents | ||
| + | DROP COLUMN numberofclaims, | ||
| + | DROP COLUMN filingdate, | ||
| + | DROP COLUMN applicationnumber, | ||
| + | DROP COLUMN type; | ||
| + | |||
| + | |||
| + | UPDATE patents | ||
| + | SET prioritycountry = NULL | ||
| + | WHERE prioritycountry = 'NULL'; | ||
| + | |||
| + | UPDATE patents | ||
| + | SET pctpatentnumber = NULL | ||
| + | WHERE pctpatentnumber = 'NULL'; | ||
| + | |||
| + | UPDATE patents | ||
| + | SET prioritypatentnumber = NULL | ||
| + | WHERE prioritypatentnumber = 'NULL'; | ||
| + | |||
| + | UPDATE patents | ||
| + | SET cpcsubgroup = NULL | ||
| + | WHERE cpcsubgroup = 'NULL'; | ||
| + | |||
| + | UPDATE patents | ||
| + | SET appnum = NULL | ||
| + | WHERE appnum = -1; | ||
| + | |||
| + | UPDATE patents | ||
| + | SET gyear = NULL | ||
| + | WHERE gyear = -1; | ||
| + | |||
| + | UPDATE patents | ||
| + | |||
| + | SET appyear = NULL | ||
| + | WHERE appyear = -1; | ||
| + | |||
| + | Results: | ||
| + | |||
| + | allpatent=# ALTER TABLE patents | ||
| + | allpatent-# DROP COLUMN numberofclaims, | ||
| + | allpatent-# DROP COLUMN filingdate, | ||
| + | allpatent-# DROP COLUMN applicationnumber, | ||
| + | allpatent-# DROP COLUMN type; | ||
| + | ALTER TABLE | ||
| + | allpatent=# | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET prioritycountry = NULL | ||
| + | allpatent-# WHERE prioritycountry = 'NULL'; | ||
| + | ^[[BUPDATE 3764926 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET pctpatentnumber = NULL | ||
| + | allpatent-# WHERE pctpatentnumber = 'NULL'; | ||
| + | UPDATE 3764926 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET prioritypatentnumber = NULL | ||
| + | allpatent-# WHERE prioritypatentnumber = 'NULL'; | ||
| + | UPDATE 3764926 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET cpcsubgroup = NULL | ||
| + | allpatent-# WHERE cpcsubgroup = 'NULL'; | ||
| + | UPDATE 3764926 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET appnum = NULL | ||
| + | allpatent-# WHERE appnum = -1; | ||
| + | UPDATE 0 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET gyear = NULL | ||
| + | allpatent-# WHERE gyear = -1; | ||
| + | UPDATE 0 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE patents | ||
| + | allpatent-# SET appyear = NULL | ||
| + | allpatent-# WHERE appyear = -1; | ||
| + | UPDATE 0 | ||
| + | allpatent=# | ||
| + | |||
| + | |||
| + | UPDATE assignees | ||
| + | SET lastname = NULL | ||
| + | WHERE lastname = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET firstname = NULL | ||
| + | WHERE firstname = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET address = NULL | ||
| + | WHERE address = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET postcode = NULL | ||
| + | WHERE postcode = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET patentcountry = NULL | ||
| + | WHERE patentcountry = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET nationality2 = NULL | ||
| + | WHERE nationality2 = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET residence = NULL | ||
| + | WHERE residence = 'null'; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET asgseq = NULL | ||
| + | WHERE asgseq= -1; | ||
| + | |||
| + | UPDATE assignees | ||
| + | SET asgtype = NULL | ||
| + | WHERE asgtype = -1; | ||
| + | |||
| + | RESULTS: | ||
| + | |||
| + | UPDATE assignees | ||
| + | allpatent-# SET lastname = NULL | ||
| + | allpatent-# WHERE lastname = 'null'; | ||
| + | |||
| + | UPDATE 3818842 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET firstname = NULL | ||
| + | allpatent-# WHERE firstname = 'null'; | ||
| + | UPDATE 3818842 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET address = NULL | ||
| + | allpatent-# WHERE address = 'null'; | ||
| + | UPDATE 3818842 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET postcode = NULL | ||
| + | allpatent-# WHERE postcode = 'null'; | ||
| + | UPDATE 3818842 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET patentcountry = NULL | ||
| + | allpatent-# WHERE patentcountry = 'null'; | ||
| + | UPDATE 3818842 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET nationality2 = NULL | ||
| + | allpatent-# WHERE nationality2 = 'null'; | ||
| + | UPDATE 1607714 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET residence = NULL | ||
| + | allpatent-# WHERE residence = 'null'; | ||
| + | UPDATE 1607714 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET asgseq = NULL | ||
| + | allpatent-# WHERE asgseq= -1; | ||
| + | UPDATE 1607714 | ||
| + | allpatent=# | ||
| + | allpatent=# UPDATE assignees | ||
| + | allpatent-# SET asgtype = NULL | ||
| + | allpatent-# WHERE asgtype = -1; | ||
| + | UPDATE 1607714 | ||
| + | allpatent=# | ||
| + | |||
| + | |||
| + | CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser; | ||
| + | |||
| + | == Renaming Tables and Columns == | ||
| + | |||
| + | To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed. | ||
| + | * '''allpatent''' database -> '''patent''' | ||
| + | * assignees -> assignee | ||
| + | * judges -> judge | ||
| + | * citations -> citation | ||
| + | * matchassignees -> MatchOrgNames | ||
| + | * patents -> patent | ||
| + | * assignees -> ptoassignee | ||
| + | * assignments -> ptoassignment | ||
| + | * assignors -> ptoassignor | ||
| + | * patentassignment -> ptopatentfile | ||
| + | * properties -> ptoproperty | ||
| + | * mslfee -> feestatus | ||
| + | * patentmaintenancefee -> fee | ||
Latest revision as of 12:41, 21 September 2020
| Patent Data Cleanup (June 2016) | |
|---|---|
| Project Information | |
| Has title | Patent Data Cleanup - June 2016 |
| Has owner | Marcela Interiano |
| Has start date | |
| Has deadline date | |
| Has keywords | Data |
| Has project status | Subsume |
| Subsumed by: | Patent Assignment Data Restructure |
| Has sponsor | McNair Center |
| Has project output | Data |
| Copyright © 2019 edegan.com. All Rights Reserved. | |
About this Page
This page contains the script that was used to clean up the patents and assignees tables in allpatent.
Cleaning up includes:
- Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap.
- The 'NULL's got replaced with NULL
- The -1s got replaced with NULL as well.
- Merging some more columns, and dropping unnecessary columns:
- At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been.
- The script that follows merges those columns as well.
NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors
- Renaming tables and columns
- Table names and column names have been standardized.
- General rule of thumb is : short column names, singular table names (for example : patent and not patents)
Script
ALTER TABLE patents RENAME COLUMN patentnumber TO patent;
ALTER TABLE patents DROP COLUMN kind, DROP COLUMN title, DROP COLUMN ussubclass, ** DROP COLUMN maingroup, -- DROP COLUMN subgroup,-- DROP COLUMN cpcsubclass, ++ DROP COLUMN cpcmaingroup, ++ DROP COLUMN classificationnationalcountry, DROP COLUMN classificationnationalclass,** (?) DROP COLUMN primaryexaminerfirstname, DROP COLUMN primaryexaminerlastname, DROP COLUMN primaryexaminerdepartment, DROP COLUMN filename;
UPDATE patents SET type = '2015' WHERE type != 'NULL';
-- RESULT : UPDATE 1646225
UPDATE patents SET type = '2010' WHERE type = 'NULL'; -- RESULT : UPDATE 3764926
/* Join the historical patent data from the US PTO with the patents table */
ALTER TABLE PATENTS ADD COLUMN nber INT, ADD COLUMN uspc varchar, ADD COLUMN uspc_sub varchar;
UPDATE patents p SET nber = hp.nber, uspc = hp.uspc, uspc_sub = hp.uspc FROM historicalpatentdata hp WHERE hp.patentnumber = CAST(p.patent AS varchar);
-- RESULT : UPDATE 5113655
/* Mergeing some columns - claims and number of claims - column name : claims*/ UPDATE patents SET claims = numberofclaims WHERE claims = -1; -- RESULTS : UPDATE 1646225
/* Merging columns - UPDATE patents SET appnum = CAST (applicationnumber AS INT) where appnum = -1; -- RESULT : UPDATE 1646225
UPDATE patents SET appdate = filingdate where appdate = '0001-01-01 BC' OR filingdate is not NULL;
-- RESULT UPDATE 1646225
ALTER TABLE patents DROP COLUMN apptype;
/* Generating GYear and AppYear from the dates */ UPDATE patents SET gyear = EXTRACT(year from grantdate) WHERE gyear = -1 AND grantdate IS NOT NULL; UPDATE 1646225
UPDATE patents SET appyear = EXTRACT(year from appdate) WHERE appyear = -1 AND appdate is not null; -- RESULT UPDATE 1646225
/* Test Script */ SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
patent | integer | not null grantdate | date | type | character varying | applicationnumber | character varying | filingdate | date | prioritydate | date | prioritycountry | character varying | prioritypatentnumber | character varying | cpcsubgroup | character varying | numberofclaims | integer | pctpatentnumber | character varying | claims | integer | appnum | integer | gyear | integer | appdate | date | appyear | integer | nber | integer | uspc | character varying | uspc_sub | character varying
/* Drop the merged columns */
ALTER TABLE patents DROP COLUMN numberofclaims, DROP COLUMN filingdate, DROP COLUMN applicationnumber, DROP COLUMN type;
UPDATE patents SET prioritycountry = NULL WHERE prioritycountry = 'NULL';
UPDATE patents SET pctpatentnumber = NULL WHERE pctpatentnumber = 'NULL';
UPDATE patents SET prioritypatentnumber = NULL WHERE prioritypatentnumber = 'NULL';
UPDATE patents SET cpcsubgroup = NULL WHERE cpcsubgroup = 'NULL';
UPDATE patents SET appnum = NULL WHERE appnum = -1;
UPDATE patents SET gyear = NULL WHERE gyear = -1;
UPDATE patents
SET appyear = NULL WHERE appyear = -1;
Results:
allpatent=# ALTER TABLE patents allpatent-# DROP COLUMN numberofclaims, allpatent-# DROP COLUMN filingdate, allpatent-# DROP COLUMN applicationnumber, allpatent-# DROP COLUMN type; ALTER TABLE allpatent=# allpatent=# allpatent=# UPDATE patents allpatent-# SET prioritycountry = NULL allpatent-# WHERE prioritycountry = 'NULL'; ^[[BUPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET pctpatentnumber = NULL allpatent-# WHERE pctpatentnumber = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET prioritypatentnumber = NULL allpatent-# WHERE prioritypatentnumber = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET cpcsubgroup = NULL allpatent-# WHERE cpcsubgroup = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET appnum = NULL allpatent-# WHERE appnum = -1; UPDATE 0 allpatent=# allpatent=# UPDATE patents allpatent-# SET gyear = NULL allpatent-# WHERE gyear = -1; UPDATE 0 allpatent=# allpatent=# UPDATE patents allpatent-# SET appyear = NULL allpatent-# WHERE appyear = -1; UPDATE 0 allpatent=#
UPDATE assignees SET lastname = NULL WHERE lastname = 'null';
UPDATE assignees SET firstname = NULL WHERE firstname = 'null';
UPDATE assignees SET address = NULL WHERE address = 'null';
UPDATE assignees SET postcode = NULL WHERE postcode = 'null';
UPDATE assignees SET patentcountry = NULL WHERE patentcountry = 'null';
UPDATE assignees SET nationality2 = NULL WHERE nationality2 = 'null';
UPDATE assignees SET residence = NULL WHERE residence = 'null';
UPDATE assignees SET asgseq = NULL WHERE asgseq= -1;
UPDATE assignees SET asgtype = NULL WHERE asgtype = -1;
RESULTS:
UPDATE assignees allpatent-# SET lastname = NULL allpatent-# WHERE lastname = 'null';
UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET firstname = NULL allpatent-# WHERE firstname = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET address = NULL allpatent-# WHERE address = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET postcode = NULL allpatent-# WHERE postcode = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET patentcountry = NULL allpatent-# WHERE patentcountry = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET nationality2 = NULL allpatent-# WHERE nationality2 = 'null'; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET residence = NULL allpatent-# WHERE residence = 'null'; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET asgseq = NULL allpatent-# WHERE asgseq= -1; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET asgtype = NULL allpatent-# WHERE asgtype = -1; UPDATE 1607714 allpatent=#
CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;
Renaming Tables and Columns
To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.
- allpatent database -> patent
- assignees -> assignee
- judges -> judge
- citations -> citation
- matchassignees -> MatchOrgNames
- patents -> patent
- assignees -> ptoassignee
- assignments -> ptoassignment
- assignors -> ptoassignor
- patentassignment -> ptopatentfile
- properties -> ptoproperty
- mslfee -> feestatus
- patentmaintenancefee -> fee