Changes

Jump to navigation Jump to search
==Patents ==
 
Patentdata:
Column | Type | Modifiers
---------+-------------------+-----------
patent | integer |
kind | character varying |
claims | integer |
apptype | integer |
appnum | integer |
gdate | date |
gyear | integer |
appdate | date |
appyear | integer |
 
Patent_2015:
Column | Type | Modifiers
-------------------------------+---------+-----------
patentnumber | int | not null
kind | varchar |
grantdate | date |
type | varchar |
applicationnumber | varchar |
filingdate | date |
prioritydate | date |
prioritycountry | varchar |
prioritypatentnumber | varchar |
ussubclass | varchar |
maingroup | varchar |
subgroup | varchar |
cpcsubclass | varchar |
cpcmaingroup | varchar |
cpcsubgroup | varchar |
classificationnationalcountry | varchar |
classificationnationalclass | varchar |
title | varchar |
numberofclaims | int |
primaryexaminerfirstname | varchar |
primaryexaminerlastname | varchar |
primaryexaminerdepartment | varchar |
pctpatentnumber | varchar |
filename | varchar |
Combined Schema:
 
Column Names:
patent int,
kind varchar,
claims int,
apptype int,
appnum int,
gdate date,
gyear int,
appdate date,
appyear int
patentnumber int, -- patent
kind varchar, -- kind
grantdate date, --gdate
type varchar,
applicationnumber varchar,
filingdate date,
prioritydate date,
prioritycountry varchar,
prioritypatentnumber varchar,
ussubclass varchar,
maingroup varchar,
subgroup varchar,
cpcsubclass varchar,
cpcmaingroup varchar,
cpcsubgroup varchar,
classificationnationalcountry varchar,
classificationnationalclass varchar,
title varchar,
numberofclaims int,
primaryexaminerfirstname varchar,
primaryexaminerlastname varchar,
primaryexaminerdepartment varchar,
pctpatentnumber varchar,
filename varchar
claims int,
apptype int,
appnum int,
gyear int,
appdate date,
appyear int
Output Schema: patents
CREATE TABLE patents_merged(
patentnumber int,
kind varchar,
grantdate date,
type varchar,
applicationnumber varchar,
filingdate date,
prioritydate date,
prioritycountry varchar,
prioritypatentnumber varchar,
ussubclass varchar,
maingroup varchar,
subgroup varchar,
cpcsubclass varchar,
cpcmaingroup varchar,
cpcsubgroup varchar,
classificationnationalcountry varchar,
classificationnationalclass varchar,
title varchar,
numberofclaims int,
primaryexaminerfirstname varchar,
primaryexaminerlastname varchar,
primaryexaminerdepartment varchar,
pctpatentnumber varchar,
filename varchar,
claims int,
apptype int,
appnum int,
gyear int,
appdate date,
appyear int
);
patentdata:
INSERT INTO patents_merged
(
SELECT
patent,
kind,
gdate,
'NULL',
'NULL',
NULL,
NULL,
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
-1,
'NULL',
'NULL',
'NULL',
'NULL',
'NULL',
claims,
apptype,
appnum,
gyear,
appdate,
appyear
FROM patents
);
-- RESULT : INSERT 0 3984771
patent_2015:
INSERT INTO patents_merged
(
SELECT
patentnumber,
kind,
grantdate,
type,
applicationnumber,
filingdate,
prioritydate,
prioritycountry,
prioritypatentnumber,
ussubclass,
maingroup,
subgroup,
cpcsubclass,
cpcmaingroup,
cpcsubgroup,
classificationnationalcountry,
classificationnationalclass,
title,
numberofclaims,
primaryexaminerfirstname,
primaryexaminerlastname,
primaryexaminerdepartment,
pctpatentnumber,
filename,
-1,
-1,
-1,
-1,
NULL,
-1
FROM patents
);
-- RESULT : INSERT 0 1646225
COPY SCRIPTS:
patentdata:
\COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--COPY 3984771
 
patent_2015:
\COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
--COPY 1646225
 
PATENTS TABLE
\COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
-- RESULT : COPY 3984771
\COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;
-- RESULT : COPY 1646225
 
 
TESTING:
select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
--RESULT: 5411151
EXPECTED: 5426566
 
SELECT COUNT(*), *
FROM patents
GROUP BY
patentnumber,
kind,
grantdate,
type,
applicationnumber,
filingdate,
prioritydate,
prioritycountry,
prioritypatentnumber,
ussubclass,
maingroup,
subgroup,
cpcsubclass,
cpcmaingroup,
cpcsubgroup,
classificationnationalcountry,
classificationnationalclass,
title,
numberofclaims,
primaryexaminerfirstname,
primaryexaminerlastname,
primaryexaminerdepartment,
pctpatentnumber,
filename,
claims,
apptype,
appnum,
gyear,
appdate,
appyear
HAVING COUNT(*) > 1;
 
SELECT patentnumber, count(*)
FROM patents
GROUP BY patentnumber
HAVING count(*)>1;
--7640598
 
 
SELECT *
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)>1
)
ORDER BY op.patentnumber;
 
(
SELECT *
INTO patentsCleaned
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)=1
)
ORDER BY op.patentnumber
)
--SELECT 5191306
 
 
INSERT INTO patentsCleaned(
SELECT *
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)>1
)
AND op.applicationnumber NOT LIKE 'NULL'
ORDER BY op.patentnumber
);
 
--219845
 
TESTING:
allpatent=# select count(*) from patentsCleaned;
count
---------
5411151
(1 row)
 
allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
count | patentnumber
-------+--------------
(0 rows)
 
==== INDEX CREATION ====
allpatent=# CREATE INDEX ON assignees (orgname);
CREATE INDEX
 
 
 
 
 
 
**Patents
Patentdata:
appyear | integer |
Column Names:
patent int,
kind varchar,
claims int,
apptype int,
appnum int,
gdate date,
gyear int,
appdate date,
appyear int
 
 
 
Patent_2015:
Column | Type | Modifiers
filename | varchar |
Column Names:
patentnumber int,
kind varchar,
grantdate date,
type varchar,
applicationnumber varchar,
filingdate date,
prioritydate date,
prioritycountry varchar,
prioritypatentnumber varchar,
ussubclass varchar,
maingroup varchar,
subgroup varchar,
cpcsubclass varchar,
cpcmaingroup varchar,
cpcsubgroup varchar,
classificationnationalcountry varchar,
classificationnationalclass varchar,
title varchar,
numberofclaims int,
primaryexaminerfirstname varchar,
primaryexaminerlastname varchar,
primaryexaminerdepartment varchar,
pctpatentnumber varchar,
filename varchar
Combined Schema:
);
'''Sample Insert and Copy Statements'''
patentdata:
INSERT INTO patents_merged
====TESTING:====
select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
--RESULT: 5411151
EXPECTED: 5426566
 
We found some copies of a few rows, where both the patent_2015 and patentdata
SELECT COUNT(*), *

Navigation menu