Difference between revisions of "Patent Data Processing - SQL Steps"

From edegan.com
Jump to navigation Jump to search
 
(33 intermediate revisions by 6 users not shown)
Line 1: Line 1:
** Assignees Data:
+
{{Project
CREATE TABLE assignees_temp2 (
+
|Has project output=
lastname  varchar,
+
|Has sponsor=McNair Center
firstname varchar,
+
|Has title=Patent Data Processing - SQL Steps
address varchar,
+
|Has owner=Shelby Bice,
postcode varchar,
+
|Has project status=Subsume
orgname varchar, --assignee varchar,
+
|Has keywords=Tool
city varchar, --city2 varchar,
+
}}
country varchar, --country2 varchar,
+
Return to [[Patent Data (Wiki Page)]].
patentnumber int, --patent integer,
 
state varchar, --state2 varchar,
 
patentcountry varchar,
 
nationality2 varchar,
 
residence varchar,
 
asgseq int,
 
asgtype integer
 
);
 
  
CREATE TABLE assignees
+
== Objective ==
(
+
The McNair Center owns two sets of patent data - one set that is inherited from Harvard, the Harvard dataverse, which is stored in the database patentdata and another that is generated by crawlers pulling data from the USPTO website, which is stored in the database '''patent_2015'''.
lastname  varchar,
 
firstname varchar,
 
address varchar,
 
postcode varchar,
 
orgname varchar,
 
city varchar,
 
country varchar,
 
patentnumber int,
 
state varchar,
 
patentcountry varchar,
 
nationality2 varchar,
 
residence varchar,
 
asgseq int,
 
asgtype integer
 
);
 
  
 +
We are now merging and cleaning the two data sets, and storing them in a schema that is amalgamation of the two underlying schema for the citations tables, assignees tables, and patents tables. The destination schema is '''allpatent'''.
  
PatentData:
+
== Assignees Data==
Column    |      Type        | Modifiers
 
-------------+-------------------+-----------
 
patent      | integer          |
 
asgtype    | integer          |
 
assignee    | character varying |
 
city        | character varying |
 
state      | character varying |
 
country    | character varying |
 
nationality | character varying |
 
residence  | character varying |
 
asgseq      | integer          |
 
  
(
+
The schema for the assignees table in '''patentdata''' database is:
patent int,
 
asgtype int,
 
assignee varchar ,
 
city varchar ,
 
state varchar ,
 
country varchar ,
 
nationality varchar ,
 
residence varchar ,
 
asgseq int         
 
)
 
  
INSERT INTO assignees_merge
+
    Column    |      Type        | Modifiers
(
+
-------------+-------------------+-----------
SELECT
+
  patent      | integer          |
'null',
+
  asgtype    | integer          |
'null',
+
  assignee   | character varying |
'null',
+
  city       | character varying |
'null',
+
  state       | character varying |
a.assignee,
+
  country    | character varying |
a.city,
+
  nationality | character varying |
a.country,
+
  residence   | character varying |
a.patent,
+
  asgseq     | integer          |
a.state,
 
'null',
 
a.nationality,
 
a.residence,
 
a.asgseq,
 
a.asgtype
 
FROM assignees a
 
);
 
 
 
/// Reference:
 
CREATE TABLE assignees_temp2 (
 
  
lastname  varchar,
+
The schema for the assignees table in patent_2015 is :
firstname varchar,
 
address varchar,
 
postcode varchar,
 
orgname varchar, --assignee varchar,
 
city varchar, --city2 varchar,
 
country varchar, --country2 varchar,
 
patentnumber int, --patent integer,
 
state varchar, --state2 varchar,
 
patentcountry varchar,
 
nationality2 varchar,
 
residence varchar,
 
asgseq int,
 
asgtype integer
 
  
);
+
    Column    |  Type  | Modifiers
//////
+
  --------------+---------+-----------
 
 
 
 
 
 
Patent_2015:
 
Column    |  Type  | Modifiers
 
  ---------------+---------+-----------
 
 
  lastname      | text    |
 
  lastname      | text    |
 
  firstname    | text    |
 
  firstname    | text    |
Line 118: Line 45:
 
  postcode      | text    |
 
  postcode      | text    |
  
 +
To merge both schemas, we have some columns that overlap, and some columns that don't.
  
(
+
'''Overlapping Columns'''
lastname varchar,
 
firstname varchar,
 
orgname varchar,
 
city varchar,
 
country varchar,
 
patentcountry varchar,
 
patentnumber int,
 
state varchar,
 
address varchar,
 
postcode varchar
 
)
 
  
INSERT INTO assignees_merge
+
  patent_2015  | patentdata
(
+
--------------+--------------
SELECT
+
  orgname     | assignee
assignees.lastname,
+
  city        | city  
assignees.firstname,
+
  country      | country
assignees.address,
+
  patentnumber | patent
assignees.postcode,
+
  state        | state
assignees.orgname,
 
assignees.city,
 
assignees.country,
 
assignees.patentnumber,
 
assignees.state,
 
assignees.patentcountry,
 
'null',
 
'null',
 
-1,
 
-1
 
FROM assignees
 
);
 
  
 +
These columns will have entries for most rows in the table, because they exist in both tables. The rest of the columns will be populated based on which table the row is coming from.
  
\COPY assignees_merge TO '/tmp/assignees_merge_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
'''Final Schema'''
\COPY assignees FROM '/tmp/assignees_merge_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 
--1607724
 
  
\COPY assignees_merge TO '/tmp/assignees_merge_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
        Table "public.assignees"
\COPY assignees FROM '/tmp/assignees_merge_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
    Column    |      Type        | Modifiers
--3818842
+
---------------+-------------------+-----------
 +
  lastname      | character varying |
 +
  firstname    | character varying |
 +
  address      | character varying |
 +
  postcode      | character varying |
 +
  orgname      | character varying |
 +
  city          | character varying |
 +
  country      | character varying |
 +
  patentnumber  | integer          |
 +
  state        | character varying |
 +
  patentcountry | character varying |
 +
  nationality2  | character varying |
 +
  residence    | character varying |
 +
  asgseq        | integer          |
 +
  asgtype      | integer          |
  
 +
'''Non-overlapping Columns'''
 +
These are the columns that belong to either one of the assignees tables, and not to both. For these cases, to help users understand where the row is coming from, the following insert rules have been followed:
  
 +
*For columns of type int, insert -1
 +
*For columns of type string (character varying), the string 'null' has been inserted.
  
 +
Therefore, if a row has appropriate values for orgname, state, city ,etc, but 'null' values for lastname, firstname, address and postcode, the row has come from the patentdata table.
  
SELECT count(*), *
+
==== Index ====
from assignees
+
Since the table is relatively large, and is likely to be searched often, an index has been imposed on the table.
GROUP BY lastname, firstname, address, postcode, orgname, city, country, patentnumber, state, patentcountry, nationality2, residence, asgseq, asgtype
 
HAVING COUNT(*) > 1
 
;
 
  
CREATE TABLE Assignees2 AS
+
  allpatent=# CREATE INDEX ON assignees (orgname);
SELECT *
+
  CREATE INDEX
from assignees  
 
GROUP BY lastname, firstname, address, postcode, orgname, city, country, patentnumber, state, patentcountry, nationality2, residence, asgseq, asgtype
 
HAVING COUNT(*) = 1
 
;
 
  
DROP TABLE Assignees;
+
====Sample insert and copy commands ====
ALTER TABLE Assignees2 RENAME TO Assignees;
+
INSERT INTO assignees_merge
 +
(
 +
SELECT
 +
'null',
 +
'null',
 +
'null',
 +
'null',
 +
a.assignee,
 +
a.city,
 +
a.country,
 +
a.patent,
 +
a.state,
 +
'null',
 +
a.nationality,
 +
a.residence,
 +
a.asgseq,
 +
a.asgtype
 +
FROM assignees a
 +
);
  
--  
+
INSERT INTO assignees_merge
 +
(
 +
SELECT
 +
assignees.lastname,
 +
assignees.firstname,
 +
assignees.address,
 +
assignees.postcode,
 +
assignees.orgname,
 +
assignees.city,
 +
assignees.country,
 +
assignees.patentnumber,
 +
assignees.state,
 +
assignees.patentcountry,
 +
'null',
 +
'null',
 +
-1,
 +
-1
 +
FROM assignees
 +
);
 +
 
 +
 
 +
\COPY assignees_merge TO '/tmp/assignees_merge_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 +
\COPY assignees FROM '/tmp/assignees_merge_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 +
--1607724
 +
 
 +
\COPY assignees_merge TO '/tmp/assignees_merge_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 +
\COPY assignees FROM '/tmp/assignees_merge_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 +
--3818842
  
 +
Note : The assignees table was updated on 6/23 to remove the 'null' string and the '-1' values.
  
 +
==Patents ==
 +
 +
'''Patentdata Schema:'''
  
Patentdata:
 
 
  Column  |      Type        | Modifiers
 
  Column  |      Type        | Modifiers
  ---------+-------------------+-----------
+
  --------+-------------------+-----------
 
  patent  | integer          |
 
  patent  | integer          |
 
  kind    | character varying |
 
  kind    | character varying |
Line 196: Line 160:
 
  appdate | date              |
 
  appdate | date              |
 
  appyear | integer          |
 
  appyear | integer          |
 
Column Names:
 
patent int,
 
kind varchar,
 
claims int,
 
apptype int,
 
appnum int,
 
gdate date,
 
gyear int,
 
appdate date,
 
appyear int
 
  
  
 +
'''Patent_2015 Schema:'''
  
 
Patent_2015:
 
 
             Column            |  Type  | Modifiers
 
             Column            |  Type  | Modifiers
 
  -------------------------------+---------+-----------
 
  -------------------------------+---------+-----------
Line 239: Line 191:
 
   filename                      | varchar    |
 
   filename                      | varchar    |
 
    
 
    
  Column Names:
+
''' Overlapping Columns '''
     patentnumber int,
+
  patent_data     patent_2015
    kind varchar,
+
  --------------+-------------
    grantdate date,
+
     patent      |  patentnumber
    type varchar,
+
     kind        |  kind
    applicationnumber varchar,
+
     claims      |  numberofclaims
    filingdate date,
+
     apptype     |  type
    prioritydate date,
+
     appnum     |  applicationnumber
    prioritycountry varchar,
+
     gdate      |  grantdate
    prioritypatentnumber varchar,
+
     appdate    |  filingdate
    ussubclass varchar,
 
    maingroup varchar,
 
    subgroup varchar,
 
    cpcsubclass varchar,
 
    cpcmaingroup varchar,
 
    cpcsubgroup varchar,
 
    classificationnationalcountry varchar,
 
    classificationnationalclass varchar,
 
    title varchar,
 
    numberofclaims int,
 
    primaryexaminerfirstname varchar,
 
    primaryexaminerlastname varchar,
 
    primaryexaminerdepartment varchar,
 
    pctpatentnumber varchar,
 
    filename varchar
 
 
 
Combined Schema:
 
 
 
Column Names:
 
patent int,
 
kind varchar,
 
claims int,
 
apptype int,
 
appnum int,
 
gdate date,
 
gyear int,
 
appdate date,
 
appyear int
 
 
 
 
 
    patentnumber int, -- patent
 
    kind varchar, -- kind
 
    grantdate date, --gdate
 
     type varchar,
 
     applicationnumber varchar,
 
    filingdate date,
 
    prioritydate date,
 
    prioritycountry varchar,
 
    prioritypatentnumber varchar,
 
    ussubclass varchar,
 
    maingroup varchar,
 
    subgroup varchar,
 
    cpcsubclass varchar,
 
    cpcmaingroup varchar,
 
    cpcsubgroup varchar,
 
    classificationnationalcountry varchar,
 
    classificationnationalclass varchar,
 
    title varchar,
 
     numberofclaims int,
 
     primaryexaminerfirstname varchar,
 
     primaryexaminerlastname varchar,
 
    primaryexaminerdepartment varchar,
 
    pctpatentnumber varchar,
 
    filename varchar
 
    claims int,
 
    apptype int,
 
     appnum int,
 
     gyear int,
 
     appdate date,
 
     appyear int
 
 
      
 
      
   
 
    Output Schema: patents
 
   
 
      CREATE TABLE patents_merged(
 
        patentnumber int,
 
        kind varchar,
 
        grantdate date,
 
        type varchar,
 
        applicationnumber varchar,
 
        filingdate date,
 
        prioritydate date,
 
        prioritycountry varchar,
 
        prioritypatentnumber varchar,
 
        ussubclass varchar,
 
        maingroup varchar,
 
        subgroup varchar,
 
        cpcsubclass varchar,
 
        cpcmaingroup varchar,
 
        cpcsubgroup varchar,
 
        classificationnationalcountry varchar,
 
        classificationnationalclass varchar,
 
        title varchar,
 
        numberofclaims int,
 
        primaryexaminerfirstname varchar,
 
        primaryexaminerlastname varchar,
 
        primaryexaminerdepartment varchar,
 
        pctpatentnumber varchar,
 
        filename varchar,
 
        claims int,
 
        apptype int,
 
        appnum int,
 
        gyear int,
 
        appdate date,
 
        appyear int
 
        );
 
       
 
   
 
    patentdata:
 
    INSERT INTO patents_merged
 
    (
 
    SELECT
 
    patent,
 
        kind,
 
        gdate,
 
        'NULL',
 
        'NULL',
 
        NULL,
 
        NULL,
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        -1,
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        'NULL',
 
        claims,
 
        apptype,
 
        appnum,
 
        gyear,
 
        appdate,
 
        appyear
 
        FROM patents
 
    );
 
    -- RESULT : INSERT 0 3984771
 
   
 
    patent_2015:
 
    INSERT INTO patents_merged
 
    (
 
    SELECT
 
    patentnumber,
 
kind,
 
grantdate,
 
type,
 
applicationnumber,
 
filingdate,
 
prioritydate,
 
prioritycountry,
 
prioritypatentnumber,
 
ussubclass,
 
maingroup,
 
subgroup,
 
cpcsubclass,
 
cpcmaingroup,
 
cpcsubgroup,
 
classificationnationalcountry,
 
classificationnationalclass,
 
title,
 
numberofclaims,
 
primaryexaminerfirstname,
 
primaryexaminerlastname,
 
primaryexaminerdepartment,
 
pctpatentnumber,
 
filename,
 
-1,
 
-1,
 
-1,
 
-1,
 
NULL,
 
        -1
 
    FROM patents
 
    );
 
    -- RESULT : INSERT 0 1646225
 
   
 
COPY SCRIPTS:
 
patentdata:
 
\COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 
--COPY 3984771
 
  
patent_2015:
+
'''Combined Schema:'''
\COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 
--COPY 1646225
 
  
PATENTS TABLE
+
The final schema of the patents table is :  
\COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 
-- RESULT : COPY 3984771
 
\COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
 
-- RESULT : COPY 1646225
 
  
 +
          Column        |      Type        | Modifiers
 +
  ----------------------+-------------------+-----------
 +
    patent              | integer          | not null
 +
    grantdate            | date              |
 +
    prioritydate        | date              |
 +
    prioritycountry      | character varying |
 +
    prioritypatentnumber | character varying |
 +
    cpcsubgroup          | character varying |
 +
    pctpatentnumber      | character varying |
 +
    claims              | integer          |
 +
    appnum              | integer          |
 +
    gyear                | integer          |
 +
    appdate              | date              |
 +
    appyear              | integer          |
 +
    nber                | integer          |
 +
    uspc                | character varying |
 +
    uspc_sub            | character varying |
 +
 +
From the total list of columns belonging to both the tables (patentdata and patent_2015), a few columns, most of them related to classification of patents, have been dropped since the data in the tables was not clean.
  
TESTING:
+
Additionally, three columns - nber, uspc, uspc_sub have been added from the historicalpatentdata, a table built from data downloaded from the USPTO Bulk Data Storage. The join was executed on the patent number.
select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
 
--RESULT:  5411151
 
EXPECTED: 5426566
 
  
SELECT COUNT(*), *
+
  Note : The addition, deletion of columns as through separate [[Patent Data Cleanup - June 2016 |scripts]], therefore the scripts below will be slightly discrepant.
FROM patents
+
       
GROUP BY
+
==== Index and Key Creation ====
patentnumber,
+
Patent numbers are distinct in this table, and are central to the rest of the fields in the table. A primary key can therefore be imposed on the column. Also, since a number of searches are likely to be conducted on this table, an index has been imposed as well.
kind,
 
grantdate,
 
type,
 
applicationnumber,
 
filingdate,
 
prioritydate,
 
prioritycountry,
 
prioritypatentnumber,
 
ussubclass,
 
maingroup,
 
subgroup,
 
cpcsubclass,
 
cpcmaingroup,
 
cpcsubgroup,
 
classificationnationalcountry,
 
classificationnationalclass,
 
title,
 
numberofclaims,
 
primaryexaminerfirstname,
 
primaryexaminerlastname,
 
primaryexaminerdepartment,
 
pctpatentnumber,
 
filename,
 
claims,
 
apptype,
 
appnum,
 
gyear,
 
appdate,
 
appyear
 
HAVING COUNT(*) > 1;
 
  
SELECT patentnumber, count(*)
+
Code:
FROM patents
+
  ALTER TABLE patents ADD PRIMARY KEY (patentnumber);
GROUP BY patentnumber
+
  -- RESULT : ALTER TABLE
HAVING count(*)>1;
+
  allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber);
--7640598
 
  
 
SELECT *
 
FROM patents op
 
WHERE op.patentnumber IN
 
(
 
SELECT ip.patentnumber
 
FROM patents ip
 
GROUP BY ip.patentnumber
 
HAVING COUNT(*)>1
 
)
 
ORDER BY op.patentnumber;
 
 
(
 
SELECT *
 
INTO patentsCleaned
 
FROM patents op
 
WHERE op.patentnumber IN
 
(
 
SELECT ip.patentnumber
 
FROM patents ip
 
GROUP BY ip.patentnumber
 
HAVING COUNT(*)=1
 
)
 
ORDER BY op.patentnumber
 
)
 
--SELECT 5191306
 
 
 
INSERT INTO patentsCleaned(
 
SELECT *
 
FROM patents op
 
WHERE op.patentnumber IN
 
(
 
SELECT ip.patentnumber
 
FROM patents ip
 
GROUP BY ip.patentnumber
 
HAVING COUNT(*)>1
 
)
 
AND op.applicationnumber NOT LIKE 'NULL'
 
ORDER BY op.patentnumber
 
);
 
 
--219845
 
 
TESTING:
 
allpatent=# select count(*) from patentsCleaned;
 
  count
 
---------
 
5411151
 
(1 row)
 
 
allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
 
count | patentnumber
 
-------+--------------
 
(0 rows)
 
 
 
********** INDEX CREATION **************
 
 
ALTER TABLE patents ADD PRIMARY KEY (patentnumber);
 
-- RESULT : ALTER TABLE
 
allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber);
 
 
allpatent=# CREATE INDEX ON assignees (orgname);
 
CREATE INDEX
 
 
 
 
 
 
 
**Patents
 
 
Patentdata:
 
Column  |      Type        | Modifiers
 
---------+-------------------+-----------
 
patent  | integer          |
 
kind    | character varying |
 
claims  | integer          |
 
apptype | integer          |
 
appnum  | integer          |
 
gdate  | date              |
 
gyear  | integer          |
 
appdate | date              |
 
appyear | integer          |
 
 
Column Names:
 
patent int,
 
kind varchar,
 
claims int,
 
apptype int,
 
appnum int,
 
gdate date,
 
gyear int,
 
appdate date,
 
appyear int
 
 
 
 
 
Patent_2015:
 
            Column            |  Type  | Modifiers
 
-------------------------------+---------+-----------
 
  patentnumber                  | int | not null
 
  kind                          | varchar    |
 
  grantdate                    | date    |
 
  type                          | varchar    |
 
  applicationnumber            | varchar    |
 
  filingdate                    | date    |
 
  prioritydate                  | date    |
 
  prioritycountry              | varchar    |
 
  prioritypatentnumber          | varchar    |
 
  ussubclass                    | varchar    |
 
  maingroup                    | varchar    |
 
  subgroup                      | varchar    |
 
  cpcsubclass                  | varchar    |
 
  cpcmaingroup                  | varchar    |
 
  cpcsubgroup                  | varchar    |
 
  classificationnationalcountry | varchar    |
 
  classificationnationalclass  | varchar    |
 
  title                        | varchar    |
 
  numberofclaims                | int |
 
  primaryexaminerfirstname      | varchar    |
 
  primaryexaminerlastname      | varchar    |
 
  primaryexaminerdepartment    | varchar    |
 
  pctpatentnumber              | varchar    |
 
  filename                      | varchar    |
 
 
 
  Column Names:
 
    patentnumber int,
 
    kind varchar,
 
    grantdate date,
 
    type varchar,
 
    applicationnumber varchar,
 
    filingdate date,
 
    prioritydate date,
 
    prioritycountry varchar,
 
    prioritypatentnumber varchar,
 
    ussubclass varchar,
 
    maingroup varchar,
 
    subgroup varchar,
 
    cpcsubclass varchar,
 
    cpcmaingroup varchar,
 
    cpcsubgroup varchar,
 
    classificationnationalcountry varchar,
 
    classificationnationalclass varchar,
 
    title varchar,
 
    numberofclaims int,
 
    primaryexaminerfirstname varchar,
 
    primaryexaminerlastname varchar,
 
    primaryexaminerdepartment varchar,
 
    pctpatentnumber varchar,
 
    filename varchar
 
 
Combined Schema:
 
 
Column Names:
 
patent int,
 
kind varchar,
 
claims int,
 
apptype int,
 
appnum int,
 
gdate date,
 
gyear int,
 
appdate date,
 
appyear int
 
  
  
    patentnumber int, -- patent
+
====Sample Insert and Copy Statements====
    kind varchar, -- kind
 
    grantdate date, --gdate
 
    type varchar,
 
    applicationnumber varchar,
 
    filingdate date,
 
    prioritydate date,
 
    prioritycountry varchar,
 
    prioritypatentnumber varchar,
 
    ussubclass varchar,
 
    maingroup varchar,
 
    subgroup varchar,
 
    cpcsubclass varchar,
 
    cpcmaingroup varchar,
 
    cpcsubgroup varchar,
 
    classificationnationalcountry varchar,
 
    classificationnationalclass varchar,
 
    title varchar,
 
    numberofclaims int,
 
    primaryexaminerfirstname varchar,
 
    primaryexaminerlastname varchar,
 
    primaryexaminerdepartment varchar,
 
    pctpatentnumber varchar,
 
    filename varchar
 
    claims int,
 
    apptype int,
 
    appnum int,
 
    gyear int,
 
    appdate date,
 
    appyear int
 
   
 
   
 
    Output Schema: patents
 
   
 
      CREATE TABLE patents_merged(
 
        patentnumber int,
 
        kind varchar,
 
        grantdate date,
 
        type varchar,
 
        applicationnumber varchar,
 
        filingdate date,
 
        prioritydate date,
 
        prioritycountry varchar,
 
        prioritypatentnumber varchar,
 
        ussubclass varchar,
 
        maingroup varchar,
 
        subgroup varchar,
 
        cpcsubclass varchar,
 
        cpcmaingroup varchar,
 
        cpcsubgroup varchar,
 
        classificationnationalcountry varchar,
 
        classificationnationalclass varchar,
 
        title varchar,
 
        numberofclaims int,
 
        primaryexaminerfirstname varchar,
 
        primaryexaminerlastname varchar,
 
        primaryexaminerdepartment varchar,
 
        pctpatentnumber varchar,
 
        filename varchar,
 
        claims int,
 
        apptype int,
 
        appnum int,
 
        gyear int,
 
        appdate date,
 
        appyear int
 
        );
 
       
 
   
 
 
     patentdata:
 
     patentdata:
 
     INSERT INTO patents_merged
 
     INSERT INTO patents_merged
 
     (
 
     (
    SELECT
+
    SELECT
    patent,  
+
      patent,  
        kind,  
+
          kind,  
        gdate,
+
          gdate,  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        NULL,
+
          NULL,  
        NULL,
+
          NULL,  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        -1,
+
        -1,  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        'NULL',
+
        'NULL',  
        claims,
+
        claims,  
        apptype,
+
        apptype,  
        appnum,
+
        appnum,  
        gyear,
+
        gyear,  
        appdate,
+
        appdate,  
        appyear
+
        appyear  
        FROM patents
+
        FROM patents  
     );
+
     );  
     -- RESULT : INSERT 0 3984771
+
     -- RESULT : INSERT 0 3984771  
 
      
 
      
 
     patent_2015:
 
     patent_2015:
Line 799: Line 320:
 
COPY SCRIPTS:
 
COPY SCRIPTS:
 
patentdata:
 
patentdata:
\COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--COPY 3984771
+
  --COPY 3984771
  
 
patent_2015:
 
patent_2015:
\COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--COPY 1646225
+
  --COPY 1646225
  
 
PATENTS TABLE
 
PATENTS TABLE
\COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
-- RESULT : COPY 3984771
+
  -- RESULT : COPY 3984771
\COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
-- RESULT : COPY 1646225
+
  -- RESULT : COPY 1646225
  
 +
====TESTING ====
 +
  select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
 +
  --RESULT:  5411151
 +
  EXPECTED: 5426566
  
TESTING:
+
We found some copies of a few rows, where both the patent_2015 and patentdata
select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
 
--RESULT:  5411151
 
EXPECTED: 5426566
 
  
SELECT COUNT(*), *
+
  SELECT COUNT(*), *
FROM patents
+
  FROM patents
GROUP BY
+
  GROUP BY
patentnumber,  
+
  patentnumber,  
 
kind,  
 
kind,  
 
grantdate,
 
grantdate,
Line 851: Line 373:
 
appdate,
 
appdate,
 
appyear
 
appyear
HAVING COUNT(*) > 1;
+
  HAVING COUNT(*) > 1;
  
SELECT patentnumber, count(*)
+
  SELECT patentnumber, count(*)
FROM patents
+
  FROM patents
GROUP BY patentnumber
+
  GROUP BY patentnumber
HAVING count(*)>1;
+
  HAVING count(*)>1;
--7640598  
+
  --7640598  
  
  
SELECT *  
+
  SELECT *  
FROM patents op
+
  FROM patents op
WHERE op.patentnumber IN
+
  WHERE op.patentnumber IN
 
(
 
(
 
SELECT ip.patentnumber  
 
SELECT ip.patentnumber  
Line 869: Line 391:
 
HAVING COUNT(*)>1
 
HAVING COUNT(*)>1
 
)
 
)
ORDER BY op.patentnumber;
+
  ORDER BY op.patentnumber;
  
(
+
  (
SELECT *  
+
  SELECT *  
INTO patentsCleaned
+
  INTO patentsCleaned
FROM patents op
+
  FROM patents op
WHERE op.patentnumber IN
+
  WHERE op.patentnumber IN
 
(
 
(
 
SELECT ip.patentnumber  
 
SELECT ip.patentnumber  
Line 882: Line 404:
 
HAVING COUNT(*)=1
 
HAVING COUNT(*)=1
 
)
 
)
ORDER BY op.patentnumber
+
  ORDER BY op.patentnumber
)
+
  )
--SELECT 5191306
+
  --SELECT 5191306
  
  
INSERT INTO patentsCleaned(
+
  INSERT INTO patentsCleaned(
SELECT *  
+
  SELECT *  
FROM patents op
+
  FROM patents op
WHERE op.patentnumber IN
+
  WHERE op.patentnumber IN
 
(
 
(
 
SELECT ip.patentnumber  
 
SELECT ip.patentnumber  
Line 897: Line 419:
 
HAVING COUNT(*)>1
 
HAVING COUNT(*)>1
 
)
 
)
AND op.applicationnumber NOT LIKE 'NULL'
+
  AND op.applicationnumber NOT LIKE 'NULL'
ORDER BY op.patentnumber
+
  ORDER BY op.patentnumber
);
+
  );
 
 
--219845
 
 
 
TESTING:
 
allpatent=# select count(*) from patentsCleaned;
 
  count
 
---------
 
5411151
 
(1 row)
 
  
allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
+
  --219845
count | patentnumber
 
-------+--------------
 
(0 rows)
 
  
 +
====TESTING:====
 +
  allpatent=# select count(*) from patentsCleaned;
 +
    count
 +
  ---------
 +
  5411151
 +
  (1 row)
  
********** INDEX CREATION **************
+
  allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
 +
  count | patentnumber
 +
  -------+--------------
 +
  (0 rows)
  
ALTER TABLE patents ADD PRIMARY KEY (patentnumber);
+
== Citations==
-- RESULT : ALTER TABLE
 
allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber);
 
  
allpatent=# CREATE INDEX ON assignees (orgname);
+
In the citations table, we needed to define another function that would convert a textual patent number into a number (big int, since the patents number were exceeding the range of regular integers.)
CREATE INDEX
 
 
 
 
 
 
 
 
 
 
 
**Citations
 
  
 
To Extract Patents with Numbers Only and to Ignore Other RegExes
 
To Extract Patents with Numbers Only and to Ignore Other RegExes
CREATE OR REPLACE FUNCTION cleanpatno (text) RETURNS bigint AS $$
+
  CREATE OR REPLACE FUNCTION cleanpatno (text) RETURNS bigint AS $$
 
if ($_[0]) {
 
if ($_[0]) {
 
my $var=$_[0];
 
my $var=$_[0];
Line 942: Line 452:
  
  
************ Columns *********************
 
patentdata:
 
Column    |      Type        | Modifiers
 
-------------+-------------------+-----------
 
patent      | integer          |
 
cit_date    | date              |
 
cit_name    | character varying |
 
cit_kind    | character varying |
 
cit_country | character varying |
 
citation    | integer          |
 
category    | character varying |
 
citseq      | integer          |
 
  
SELECT patent as citingpatentnumber, citation AS citedpatentnumber
+
'''patentdata schema:'''
INTO citations_merged
+
 
FROM citations;
+
    Column    |      Type        | Modifiers
--SELECT 38452957
+
  ------------+-------------------+-----------
 +
  patent    | integer          |
 +
  cit_date    | date              |
 +
  cit_name    | character varying |
 +
  cit_kind    | character varying |
 +
  cit_country | character varying |
 +
  citation    | integer          |
 +
  category    | character varying |
 +
  citseq      | integer          |
 +
 
 +
  SELECT patent as citingpatentnumber, citation AS citedpatentnumber
 +
  INTO citations_merged
 +
  FROM citations;
 +
  --SELECT 38452957
 +
 
 +
'''patent_2015 schema:'''
 +
        Column        |  Type  | Modifiers
 +
  ---------------------+---------+-----------
 +
  citingpatentnumber  | integer |
 +
  citingpatentcountry | text    |
 +
  citedpatentnumber  | text    |
 +
  citedpatentcountry  | text    |
  
patent_2015:
+
   SELECT CAST(citingpatentnumber AS bigint), CAST(cleanpatno( citedpatentnumber) AS bigint) as citedpatentnumber
Column        |  Type   | Modifiers
+
  INTO citations_merged
---------------------+---------+-----------
+
   FROM citations;
citingpatentnumber | integer |
+
  -- RESULT : SELECT 59227881
citingpatentcountry | text    |
 
citedpatentnumber   | text    |
 
citedpatentcountry  | text    |
 
  
SELECT CAST(citingpatentnumber AS bigint), CAST(cleanpatno( citedpatentnumber) AS bigint) as citedpatentnumber
+
'''Overlapping Columns'''
INTO citations_merged
 
FROM citations;
 
-- RESULT : SELECT 59227881
 
  
FINAL TABLE:
+
      patent_2015    |  patentdata  |
 +
  ---------------------+---------------+
 +
  citingpatentnumber  | patent        |
 +
  citedpatentnumber  | citation      |
 +
 
 +
''' Combined Schema:'''
  
CREATE TABLE citations
+
      Column      |  Type  | Modifiers
(
+
--------------------+--------+-----------
citingpatentnumber bigint,
+
citingpatentnumber | bigint |
citedpatentnumber bigint
+
citedpatentnumber | bigint |
);
 
  
  
Line 985: Line 502:
  
 
patentdata:
 
patentdata:
\COPY citations_merged TO '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY citations_merged TO '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--COPY 38452957
+
  --COPY 38452957
  
 
patent_2015:
 
patent_2015:
\COPY citations_merged TO '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY citations_merged TO '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
-- RESULT : COPY 59227881
+
  -- RESULT : COPY 59227881
  
 
allpatent:
 
allpatent:
\COPY citations FROM '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY citations FROM '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--RESULT : COPY 59227881
+
  --RESULT : COPY 59227881
  
\COPY citations FROM '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
  \COPY citations FROM '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--RESULT: COPY 38452957
+
  --RESULT: COPY 38452957
  
  
Line 1,005: Line 522:
 
== USPTO Consolidated Patent Data ==
 
== USPTO Consolidated Patent Data ==
  
Scripts:
+
The USPTO has a repository of patent data on their Bulk Data Storage system. We have this data downloaded and loaded into a table on the patent database. Here are the steps followed:
 +
* Download file from the BDS system - we have access to CSV files.
 +
* Create table with required specs
 +
* use the \COPY command to copy the data from the file into the table.
 +
 
 +
Script follows.
 +
 
 +
'''Script:'''
 +
 
 +
/* creating patent data tables from : https://bulkdata.uspto.gov/data2/patent/maintenancefee/*/
  
/* creating patent data tables from : https://bulkdata.uspto.gov/data2/patent/maintenancefee/*/
+
CREATE TABLE PatentMaintenanceFee(
 +
patentnumber varchar,
 +
applicationnumber int,
 +
smallentity varchar,
 +
filingdate date,
 +
grantissuedate date,
 +
maintenancefeedate date,
 +
maintenancefeecode varchar
 +
);
  
CREATE TABLE PatentMaintenanceFee(
+
\COPY PatentMaintenanceFee FROM '/bulk/USPTO_Consolidated/MaintFeeEvents_20160613.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
patentnumber varchar,
+
-- RESULT : COPY 14042059
applicationnumber int,
 
smallentity varchar,
 
filingdate date,
 
grantissuedate date,
 
maintenancefeedate date,
 
maintenancefeecode varchar
 
);
 
  
\COPY PatentMaintenanceFee FROM '/bulk/USPTO_Consolidated/MaintFeeEvents_20160613.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
+
/* creating tables for historical patent data - USPTO */  
-- RESULT : COPY 14042059
 
  
/* creating tables for historical patent data - USPTO */
+
CREATE TABLE HistoricalPatentData(
 +
applicationid int,
 +
pubno varchar,
 +
patentnumber varchar,
 +
NBER int,
 +
USPC varchar,
 +
USPC_sub varchar,
 +
applicationdate date,
 +
prioritydate date,
 +
pubdate date,
 +
displaydate date,
 +
disptype varchar,
 +
exp_dt date,
 +
exp_dt_max date,
 +
pta int
 +
);
  
CREATE TABLE HistoricalPatentData(
+
\COPY historicalpatentdata FROM '/bulk/USPTO_Consolidated/HistoricalFiles/historical_masterfile.csv' DELIMITER AS ',' HEADER NULL AS '' CSV;
applicationid int,
 
pubno varchar,
 
patentnumber varchar,
 
NBER int,
 
USPC varchar,
 
USPC_sub varchar,
 
applicationdate date,
 
prioritydate date,
 
pubdate date,
 
displaydate date,
 
disptype varchar,
 
exp_dt date,
 
exp_dt_max date,
 
pta int
 
);
 
  
\COPY historicalpatentdata FROM '/bulk/USPTO_Consolidated/HistoricalFiles/historical_masterfile.csv' DELIMITER AS ',' HEADER NULL AS '' CSV;
+
--COPY 11191813
  
--COPY 11191813
+
[[Category:Internal]]
 +
[[Internal Classification::Legacy| ]]
 +
[[Category:Patent]]

Latest revision as of 13:34, 21 September 2020


Project
Patent Data Processing - SQL Steps
Project logo 02.png
Project Information
Has title Patent Data Processing - SQL Steps
Has owner Shelby Bice
Has start date
Has deadline date
Has keywords Tool
Has project status Subsume
Has sponsor McNair Center
Copyright © 2019 edegan.com. All Rights Reserved.

Return to Patent Data (Wiki Page).

Objective

The McNair Center owns two sets of patent data - one set that is inherited from Harvard, the Harvard dataverse, which is stored in the database patentdata and another that is generated by crawlers pulling data from the USPTO website, which is stored in the database patent_2015.

We are now merging and cleaning the two data sets, and storing them in a schema that is amalgamation of the two underlying schema for the citations tables, assignees tables, and patents tables. The destination schema is allpatent.

Assignees Data

The schema for the assignees table in patentdata database is:

   Column    |       Type        | Modifiers
-------------+-------------------+-----------
 patent      | integer           |
 asgtype     | integer           |
 assignee    | character varying |
 city        | character varying |
 state       | character varying |
 country     | character varying |
 nationality | character varying |
 residence   | character varying |
 asgseq      | integer           |

The schema for the assignees table in patent_2015 is :

   Column     |  Type   | Modifiers
--------------+---------+-----------
lastname      | text    |
firstname     | text    |
orgname       | text    |
city          | text    |
country       | text    |
patentcountry | text    |
patentnumber  | integer |
state         | text    |
address       | text    |
postcode      | text    |

To merge both schemas, we have some columns that overlap, and some columns that don't.

Overlapping Columns

 patent_2015  | patentdata
--------------+--------------
 orgname      | assignee 
 city         | city 
 country      | country
 patentnumber | patent 
 state        | state

These columns will have entries for most rows in the table, because they exist in both tables. The rest of the columns will be populated based on which table the row is coming from.

Final Schema

        Table "public.assignees"
    Column     |       Type        | Modifiers
---------------+-------------------+-----------
 lastname      | character varying |
 firstname     | character varying |
 address       | character varying |
 postcode      | character varying |
 orgname       | character varying |
 city          | character varying |
 country       | character varying |
 patentnumber  | integer           |
 state         | character varying |
 patentcountry | character varying |
 nationality2  | character varying |
 residence     | character varying |
 asgseq        | integer           |
 asgtype       | integer           |

Non-overlapping Columns These are the columns that belong to either one of the assignees tables, and not to both. For these cases, to help users understand where the row is coming from, the following insert rules have been followed:

  • For columns of type int, insert -1
  • For columns of type string (character varying), the string 'null' has been inserted.

Therefore, if a row has appropriate values for orgname, state, city ,etc, but 'null' values for lastname, firstname, address and postcode, the row has come from the patentdata table.

Index

Since the table is relatively large, and is likely to be searched often, an index has been imposed on the table.

 allpatent=# CREATE INDEX ON assignees (orgname);
 CREATE INDEX

Sample insert and copy commands

INSERT INTO assignees_merge ( SELECT 'null', 'null', 'null', 'null', a.assignee, a.city, a.country, a.patent, a.state, 'null', a.nationality, a.residence, a.asgseq, a.asgtype FROM assignees a );

INSERT INTO assignees_merge ( SELECT assignees.lastname, assignees.firstname, assignees.address, assignees.postcode, assignees.orgname, assignees.city, assignees.country, assignees.patentnumber, assignees.state, assignees.patentcountry, 'null', 'null', -1, -1 FROM assignees );


\COPY assignees_merge TO '/tmp/assignees_merge_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; \COPY assignees FROM '/tmp/assignees_merge_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --1607724

\COPY assignees_merge TO '/tmp/assignees_merge_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; \COPY assignees FROM '/tmp/assignees_merge_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --3818842

Note : The assignees table was updated on 6/23 to remove the 'null' string and the '-1' values.

Patents

Patentdata Schema:

Column  |       Type        | Modifiers
--------+-------------------+-----------
patent  | integer           |
kind    | character varying |
claims  | integer           |
apptype | integer           |
appnum  | integer           |
gdate   | date              |
gyear   | integer           |
appdate | date              |
appyear | integer           |


Patent_2015 Schema:

            Column             |  Type   | Modifiers
-------------------------------+---------+-----------
 patentnumber                  | int | not null
 kind                          | varchar    |
 grantdate                     | date    |
 type                          | varchar    |
 applicationnumber             | varchar    |
 filingdate                    | date    |
 prioritydate                  | date    |
 prioritycountry               | varchar    |
 prioritypatentnumber          | varchar    |
 ussubclass                    | varchar    |
 maingroup                     | varchar    |
 subgroup                      | varchar    |
 cpcsubclass                   | varchar    |
 cpcmaingroup                  | varchar    |
 cpcsubgroup                   | varchar    |
 classificationnationalcountry | varchar    |
 classificationnationalclass   | varchar    |
 title                         | varchar    |
 numberofclaims                | int |
 primaryexaminerfirstname      | varchar    |
 primaryexaminerlastname       | varchar    |
 primaryexaminerdepartment     | varchar    |
 pctpatentnumber               | varchar    |
 filename                      | varchar    |
 

Overlapping Columns

  patent_data     patent_2015
 --------------+-------------
   patent      |  patentnumber
   kind        |  kind
   claims      |  numberofclaims
   apptype     |  type
   appnum      |  applicationnumber
   gdate       |  grantdate
   appdate     |  filingdate
   

Combined Schema:

The final schema of the patents table is :

          Column        |       Type        | Modifiers
  ----------------------+-------------------+-----------
   patent               | integer           | not null
   grantdate            | date              |
   prioritydate         | date              |
   prioritycountry      | character varying |
   prioritypatentnumber | character varying |
   cpcsubgroup          | character varying |
   pctpatentnumber      | character varying |
   claims               | integer           |
   appnum               | integer           |
   gyear                | integer           |
   appdate              | date              |
   appyear              | integer           |
   nber                 | integer           |
   uspc                 | character varying |
   uspc_sub             | character varying |

From the total list of columns belonging to both the tables (patentdata and patent_2015), a few columns, most of them related to classification of patents, have been dropped since the data in the tables was not clean.

Additionally, three columns - nber, uspc, uspc_sub have been added from the historicalpatentdata, a table built from data downloaded from the USPTO Bulk Data Storage. The join was executed on the patent number.

 Note : The addition, deletion of columns as through separate scripts, therefore the scripts below will be slightly discrepant.
       

Index and Key Creation

Patent numbers are distinct in this table, and are central to the rest of the fields in the table. A primary key can therefore be imposed on the column. Also, since a number of searches are likely to be conducted on this table, an index has been imposed as well.

Code:

 ALTER TABLE patents ADD PRIMARY KEY (patentnumber);
 -- RESULT : ALTER TABLE
 allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber);


Sample Insert and Copy Statements

   patentdata:
   INSERT INTO patents_merged
   (
   		 SELECT	
   		  patent, 

kind, gdate, 'NULL', 'NULL', NULL, NULL, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', -1, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', claims, apptype, appnum, gyear, appdate,

       	 appyear 
       	 FROM patents 
   ); 
   -- RESULT : INSERT 0 3984771 
   
   patent_2015:
   INSERT INTO patents_merged
   (
   	SELECT
   		patentnumber, 

kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, -1, -1, -1, -1, NULL,

       	-1
   	FROM patents
   );
   -- RESULT : INSERT 0 1646225
   

COPY SCRIPTS: patentdata:

 \COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 --COPY 3984771

patent_2015:

 \COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 --COPY 1646225

PATENTS TABLE

 \COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 -- RESULT : COPY 3984771
 \COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 -- RESULT : COPY 1646225

TESTING

 select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
 --RESULT:  5411151
  EXPECTED: 5426566

We found some copies of a few rows, where both the patent_2015 and patentdata

 SELECT COUNT(*), *
 FROM patents
 GROUP BY
 	patentnumber, 

kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, claims, apptype, appnum, gyear, appdate, appyear

 HAVING COUNT(*) > 1;
 SELECT patentnumber, count(*)
 FROM patents
 GROUP BY patentnumber
 HAVING count(*)>1;
 --7640598 


 SELECT * 
 FROM patents op
 WHERE op.patentnumber IN

( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)>1 )

 ORDER BY op.patentnumber;
 (
 SELECT * 
 INTO patentsCleaned
 FROM patents op
 WHERE op.patentnumber IN

( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)=1 )

 ORDER BY op.patentnumber
 )
 --SELECT 5191306


 INSERT INTO patentsCleaned(
 SELECT * 
 FROM patents op
 WHERE op.patentnumber IN

( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)>1 )

 AND op.applicationnumber NOT LIKE 'NULL'
 ORDER BY op.patentnumber
 );
 --219845

TESTING:

 allpatent=# select count(*) from patentsCleaned;
   count
 ---------
  5411151
 (1 row)
 allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
  count | patentnumber
 -------+--------------
 (0 rows)

Citations

In the citations table, we needed to define another function that would convert a textual patent number into a number (big int, since the patents number were exceeding the range of regular integers.)

To Extract Patents with Numbers Only and to Ignore Other RegExes

 CREATE OR REPLACE FUNCTION cleanpatno (text) RETURNS bigint AS $$

if ($_[0]) { my $var=$_[0]; if ($var=~/^\d*$/) {return $var;} return undef; } return undef; $$ LANGUAGE plperl;


patentdata schema:

   Column    |       Type        | Modifiers
 ------------+-------------------+-----------
  patent     | integer           | 
 cit_date    | date              |
 cit_name    | character varying |
 cit_kind    | character varying |
 cit_country | character varying |
 citation    | integer           |
 category    | character varying |
 citseq      | integer           | 
 SELECT patent as citingpatentnumber, citation AS citedpatentnumber
 INTO citations_merged
 FROM citations;
 --SELECT 38452957

patent_2015 schema:

        Column        |  Type   | Modifiers
 ---------------------+---------+-----------
  citingpatentnumber  | integer |
  citingpatentcountry | text    |
  citedpatentnumber   | text    |
  citedpatentcountry  | text    |
 SELECT CAST(citingpatentnumber AS bigint), CAST(cleanpatno( citedpatentnumber) AS bigint) as citedpatentnumber
 INTO citations_merged
 FROM citations;
 -- RESULT : SELECT 59227881

Overlapping Columns

      patent_2015     |  patentdata   |
 ---------------------+---------------+
  citingpatentnumber  | patent        |
  citedpatentnumber   | citation      |
 

Combined Schema:

      Column       |  Type  | Modifiers

+--------+-----------

citingpatentnumber | bigint |
citedpatentnumber  | bigint |


Copy Statements:

patentdata:

 \COPY citations_merged TO '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 --COPY 38452957

patent_2015:

 \COPY citations_merged TO '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 -- RESULT : COPY 59227881

allpatent:

 \COPY citations FROM '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 --RESULT : COPY 59227881
 \COPY citations FROM '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
 --RESULT: COPY 38452957


CLONING: CREATE DATABASE allpatentsProcessed WITH TEMPLATE allpatent OWNER researcher;

USPTO Consolidated Patent Data

The USPTO has a repository of patent data on their Bulk Data Storage system. We have this data downloaded and loaded into a table on the patent database. Here are the steps followed:

  • Download file from the BDS system - we have access to CSV files.
  • Create table with required specs
  • use the \COPY command to copy the data from the file into the table.

Script follows.

Script:

/* creating patent data tables from : https://bulkdata.uspto.gov/data2/patent/maintenancefee/*/
CREATE TABLE PatentMaintenanceFee(
patentnumber varchar,
applicationnumber int,
smallentity varchar,
filingdate date,
grantissuedate date,
maintenancefeedate date,
maintenancefeecode varchar
);
\COPY PatentMaintenanceFee FROM '/bulk/USPTO_Consolidated/MaintFeeEvents_20160613.txt' DELIMITER AS E'\t' HEADER NULL AS  CSV;
-- RESULT : COPY 14042059
/* creating tables for historical patent data - USPTO */ 
CREATE TABLE HistoricalPatentData(
applicationid int,
pubno varchar,
patentnumber varchar,
NBER int,
USPC varchar,
USPC_sub varchar,
applicationdate date,
prioritydate date,
pubdate date,
displaydate date,
disptype varchar,
exp_dt date,
exp_dt_max date,
pta int
);
\COPY historicalpatentdata FROM '/bulk/USPTO_Consolidated/HistoricalFiles/historical_masterfile.csv' DELIMITER AS ',' HEADER NULL AS  CSV;
--COPY 11191813