Patent Data Cleanup (June 2016)

From edegan.com
Revision as of 13:12, 20 September 2020 by Ed (talk | contribs)
Jump to navigation Jump to search


Project
Patent Data Cleanup (June 2016)
Project logo 02.png
Project Information
Has title Patent Data Cleanup - June 2016
Has owner Marcela Interiano
Has start date
Has deadline date
Has keywords Data
Has project status Subsume
Subsumed by: Patent Assignment Data Restructure
Copyright © 2019 edegan.com. All Rights Reserved.

About this Page

This page contains the script that was used to clean up the patents and assignees tables in allpatent.

Cleaning up includes:

  • Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap.
    • The 'NULL's got replaced with NULL
    • The -1s got replaced with NULL as well.
  • Merging some more columns, and dropping unnecessary columns:
    • At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been.
    • The script that follows merges those columns as well.
NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors
  • Renaming tables and columns
    • Table names and column names have been standardized.
    • General rule of thumb is : short column names, singular table names (for example : patent and not patents)

Script

ALTER TABLE patents
RENAME COLUMN patentnumber TO patent;
ALTER TABLE patents
 DROP COLUMN kind,
 DROP COLUMN title,
 DROP COLUMN ussubclass, **
 DROP COLUMN maingroup, --
 DROP COLUMN subgroup,--
 DROP COLUMN cpcsubclass, ++
 DROP COLUMN cpcmaingroup, ++
 DROP COLUMN classificationnationalcountry,
 DROP COLUMN classificationnationalclass,** (?)
 DROP COLUMN primaryexaminerfirstname,
 DROP COLUMN primaryexaminerlastname,
 DROP COLUMN primaryexaminerdepartment,
 DROP COLUMN filename;
UPDATE patents
 SET type = '2015'
 WHERE type != 'NULL';
-- RESULT : UPDATE 1646225
UPDATE patents
 SET type = '2010'
 WHERE type = 'NULL';
-- RESULT : UPDATE 3764926
/* Join the historical patent data from the US PTO with the patents table */
ALTER TABLE PATENTS
 ADD COLUMN nber INT,
 ADD COLUMN uspc varchar,
 ADD COLUMN uspc_sub varchar;
UPDATE patents p
 SET nber = hp.nber,
 uspc = hp.uspc,
 uspc_sub = hp.uspc
  FROM historicalpatentdata hp
  WHERE hp.patentnumber = CAST(p.patent AS varchar);
-- RESULT : UPDATE 5113655
/* Mergeing some columns - claims and number of claims - column name : claims*/
UPDATE patents
 SET claims = numberofclaims
 WHERE claims = -1;
-- RESULTS : UPDATE 1646225
/* Merging columns - 
UPDATE patents
 SET appnum = CAST (applicationnumber AS INT)
 where appnum = -1;
-- RESULT : UPDATE 1646225
UPDATE patents
 SET appdate = filingdate
 where appdate = '0001-01-01 BC'
 OR filingdate is not NULL;
-- RESULT UPDATE 1646225
ALTER TABLE patents
 DROP COLUMN apptype;
/* Generating GYear and AppYear from the dates */
UPDATE patents
 SET gyear = EXTRACT(year from grantdate)
 WHERE gyear = -1
 AND grantdate IS NOT NULL;
UPDATE 1646225
UPDATE patents
 SET appyear = EXTRACT(year from appdate)
 WHERE appyear = -1 
 AND appdate is not null;
-- RESULT UPDATE 1646225
/* Test Script */
SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
patent               | integer           | not null
 grantdate            | date              |
 type                 | character varying |
 applicationnumber    | character varying |
 filingdate           | date              |
 prioritydate         | date              |
 prioritycountry      | character varying |
 prioritypatentnumber | character varying |
 cpcsubgroup          | character varying |
 numberofclaims       | integer           |
 pctpatentnumber      | character varying |
 claims               | integer           |
 appnum               | integer           |
 gyear                | integer           |
 appdate              | date              |
 appyear              | integer           |
 nber                 | integer           |
 uspc                 | character varying |
 uspc_sub             | character varying 
/* Drop the merged columns */
ALTER TABLE patents
 DROP COLUMN numberofclaims,
 DROP COLUMN filingdate,
 DROP COLUMN applicationnumber,
 DROP COLUMN type;


UPDATE patents
 SET prioritycountry = NULL
 WHERE prioritycountry = 'NULL';
UPDATE patents
 SET pctpatentnumber = NULL
 WHERE pctpatentnumber = 'NULL';
UPDATE patents
 SET prioritypatentnumber = NULL
 WHERE prioritypatentnumber = 'NULL';
UPDATE patents
 SET cpcsubgroup = NULL
 WHERE cpcsubgroup = 'NULL';
UPDATE patents
 SET appnum = NULL
 WHERE appnum = -1;
UPDATE patents
 SET gyear = NULL
 WHERE gyear = -1;
UPDATE patents
 SET appyear = NULL
 WHERE appyear = -1;
Results:
allpatent=# ALTER TABLE patents
allpatent-# DROP COLUMN numberofclaims,
allpatent-# DROP COLUMN filingdate,
allpatent-# DROP COLUMN applicationnumber,
allpatent-# DROP COLUMN type;
ALTER TABLE
allpatent=#
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET prioritycountry = NULL
allpatent-# WHERE prioritycountry = 'NULL';
^[[BUPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET pctpatentnumber = NULL
allpatent-# WHERE pctpatentnumber = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET prioritypatentnumber = NULL
allpatent-# WHERE prioritypatentnumber = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET cpcsubgroup = NULL
allpatent-# WHERE cpcsubgroup = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET appnum = NULL
allpatent-# WHERE appnum = -1;
UPDATE 0
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET gyear = NULL
allpatent-# WHERE gyear = -1;
UPDATE 0
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET appyear = NULL
allpatent-# WHERE appyear = -1;
UPDATE 0
allpatent=#


UPDATE assignees
 SET lastname = NULL
 WHERE lastname = 'null';
UPDATE assignees
 SET firstname = NULL
 WHERE firstname = 'null';
UPDATE assignees
 SET address = NULL
 WHERE address = 'null';
UPDATE assignees
 SET postcode = NULL
 WHERE   postcode = 'null';
UPDATE assignees
 SET patentcountry = NULL
 WHERE patentcountry = 'null';
UPDATE assignees
 SET nationality2 = NULL
 WHERE nationality2 = 'null';
UPDATE assignees
 SET residence = NULL
 WHERE residence = 'null';
UPDATE assignees
 SET asgseq = NULL
 WHERE asgseq= -1;
UPDATE assignees
 SET asgtype = NULL
 WHERE asgtype = -1;
RESULTS:
UPDATE assignees
allpatent-# SET lastname = NULL
allpatent-# WHERE lastname = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET firstname = NULL
allpatent-# WHERE firstname = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET address = NULL
allpatent-# WHERE address = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET postcode = NULL
allpatent-# WHERE   postcode = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET patentcountry = NULL
allpatent-# WHERE patentcountry = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET nationality2 = NULL
allpatent-# WHERE nationality2 = 'null';
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET residence = NULL
allpatent-# WHERE residence = 'null';
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET asgseq = NULL
allpatent-# WHERE asgseq= -1;
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET asgtype = NULL
allpatent-# WHERE asgtype = -1;
UPDATE 1607714
allpatent=#


CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;

Renaming Tables and Columns

To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.

  • allpatent database -> patent
  • assignees -> assignee
  • judges -> judge
  • citations -> citation
  • matchassignees -> MatchOrgNames
  • patents -> patent
  • assignees -> ptoassignee
  • assignments -> ptoassignment
  • assignors -> ptoassignor
  • patentassignment -> ptopatentfile
  • properties -> ptoproperty
  • mslfee -> feestatus
  • patentmaintenancefee -> fee