Changes

Jump to navigation Jump to search
no edit summary
*Company ve primary industry sub group 2
*Company ve primary industry sub group 3
 
 
==Tables built up to 02/07/17==
 
SDC pull #1
company founding date
company ipo date
date company received last investment
date company received first investment
company staet
company name
company industry class
company industry major group
company industry subgroup 1
company industry subgroup 3
Company industry minor group
company cusip
ttoal known amt invested in company
no rouds company received investment
company ipo status
company website
 
SDC pull #2
company name
executive's name prefix
executive's first name
executive's last name
executive's job title
 
 
 
logging on:
researcher@128.42.44.181
password: 9million
 
cd /bulk
cd \Women2017
psql Women2017
 
DROP TABLE pull3info;
CREATE TABLE pull3info(
/*taken from pull3.txt*/
ipodate date,
lastinvestdate date,
firstinvestdate date,
companyfoundingdate date,
ipostatus varchar(10),
companystate varchar(50),
companyname varchar(200),
industryclass varchar(200),
majorgroup varchar(200),
minorgroup varchar(200),
subgroup1 varchar(200),
subgroup3 varchar(200),
cusip varchar(6),
totalinvested float,
numrounds int,
website varchar(200)
);
 
\COPY pull3info FROM 'pull3-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
 
--COPY 43606
 
DROP TABLE cleancosbase;
CREATE TABLE cleancosbase AS
SELECT companyname, min(firstinvestdate) as minfirstdate FROM pull3info WHERE companyname !='Undisclosed Company' OR companyname !='New York Digital Health LLC' GROUP BY companyname;
--43534
 
DROP TABLE cleancos;
CREATE TABLE cleancos AS
SELECT pull3info.*
FROM pull3info
JOIN Cleancosbase ON pull3info.companyname=cleancosbase.companyname AND minfirstdate=firstinvestdate;
--43534
 
 
DROP TABLE pull4info;
CREATE TABLE pull4info(
/*taken from pull4.txt*/
execfirstname varchar(50),
jobtitle varchar(100),
execlastname varchar(50),
execprefix varchar(5),
companyname varchar(200)
);
 
\COPY pull4info FROM 'pull4-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
 
/*the job title column of the exec is for some reason combined with the first name
even after normalizing the data. I will repull the data from SDC to see if it was
simply an error in the pull or something. If that doesn't work, will go to ed.
*/
 
SDC pull #5
executive's name prefix
executive's first name
executive's last name
executive's job title
company name
 
DROP TABLE pull5info;
CREATE TABLE pull5info(
/*taken from pull5.txt*/
execprefix varchar(5),
execfirstname varchar(50),
execlastname varchar(50),
jobtitle varchar(100),
companyname varchar(200)
);
 
\COPY pull5info FROM 'pull5-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
 
--COPY 186181
 
DROP TABLE distinctnocompanies;
CREATE TABLE distinctcompanies AS
SELECT DISTINCT companyname FROM pull3info;
/*created this per ed's advice to see how many companies are distinct from each other
so we dont get duplicates in the dataset - only really creating a table to get the
specific number*/
 
--COPY 43536
/*this means there are 70 duplicates in the data. could mean that some companies in diff
states have the same name, or it could be a for real duplicate. can go back and check
on this later but rn seems like a relatively nominal amt.*/
 
 
DROP TABLE totaltable;
CREATE TABLE totaltable AS
SELECT * FROM pull5info LEFT JOIN pull3info ON pull5info.CompanyName = pull3info.companyname;
/*represents the table combining pull3info and pull5info, aka all of the info about
the companies and the executives' info matched on company name*/
 
/*WHERE IM AT:
 
START GROUPING BY WOMEN IN pull3info
ed says to rename pull3info but idc like i know what it is
 
*/
 
DROP TABLE commonnames;
CREATE TABLE commonnames(
/*common male and female names with corresponding 0 or 1*/
name varchar(100),
mf int
);
 
DROP TABLE commonnamesclean;
CREATE TABLE commonnamesclean AS
SELECT name, sum(mf) AS mf FROM commonnames GROUP BY name HAVING count(mf) = 1;
 
\COPY commonnames FROM 'realcommonnames.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--4654
 
DROP TABLE impoppl;
CREATE TABLE impoppl AS
/*selecting the people with important job titles*/
SELECT * FROM pull5info WHERE jobtitle IN ('Board Member', 'CEO', 'Founder', 'CFO',
'COO', 'Chairman', 'Chief Executive Officer', 'Chief Financial Officer', 'President',
'Treasurer', 'Vice President');
--67733
 
DROP TABLE binarymanwoman;
CREATE TABLE binarymanwoman AS
/*creates table with a column populated with 1 if prefix is Ms or Mrs and 0 if mr*/
SELECT*,
CASE WHEN execprefix='Ms' THEN 1::int
WHEN execprefix='Mrs' THEN 1::int
WHEN execprefix='Mr' THEN 0::int
ELSE Null::int END AS binaryval FROM pull5info;
--186181
 
DROP TABLE binarycommonmatch;
CREATE TABLE binarycommonmatch AS
/*matching common names to list of execs who have dr prefix/are gender unidentified*/
SELECT execprefix, execfirstname, execlastname, jobtitle, companyname,
CASE WHEN binaryval IS NULL AND mf IS NOT NULL THEN mf
WHEN binaryval IS NOT NULL THEN binaryval
ELSE NULL::int END AS binaryval
FROM binarymanwoman LEFT JOIN commonnamesclean ON execfirstname = name
WHERE execfirstname IS NOT NULL;
--171307
 
DROP TABLE doctors;
CREATE TABLE doctors AS
/*represents the leaders from companies who haven't been assigned a gender*/
SELECT * FROM binarycommonmatch WHERE binaryval IS NULL;
--7016
 
 
DROP TABLE women;
CREATE TABLE women AS
SELECT * FROM binarycommonmatch WHERE binaryval=1;
--18101
/*this means that 18101/171307 of the people in "important" positions are women
which is equal to 0.105. Could be +/- 0.04 bc of the doctors/unidentified genders
in the data*/
 
/*next steps: redo SDC pull, group by
 
state
city
company
exec prefix, first name, last name, job title
 
also check in w ed about the no of doctos and whether getting within a 4% margin is
ok*/
 
SDC pull #6
 
company state
company city
company name
exec first name
exec last name
exec prefix
exec job title
 
DROP TABLE execbylocation;
CREATE TABLE execbylocation(
state varchar(75),
city varchar(75),
companyname varchar(100),
execjob varchar(100),
execprefix varchar(5),
firstname varchar(50),
lastname varchar(50)
);
 
\COPY execbylocation FROM 'pull6-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--186242
 
 
DROP TABLE impopplbylocation;
CREATE TABLE impopplbylocation AS
/*selecting the people with important job titles*/
SELECT * FROM execbylocation WHERE execjob IN ('Board Member', 'CEO', 'Founder', 'CFO',
'COO', 'Chairman', 'Chief Executive Officer', 'Chief Financial Officer', 'President',
'Treasurer', 'Vice President');
--67750
 
DROP TABLE impoppllocname;
CREATE TABLE impoppllocname AS
SELECT * FROM impopplbylocation WHERE execprefix IN ('Ms', 'Mrs');
--5646
 
/*here's what I want to do: I have a table with all the (identifiable) women, and a
separate table with both men and women. I want to group by state in both, then
divide the no of females in state (impoppllocname)/total in state (impopplbylocation)
where state=state (i.e. females in california in impoppllocname divided by
total no of ppl from california in impopplbylocation) to get the percentage of females
in each state with "important" positions. however, sql is a turd and won't let me group
by state for some odd reason in either of these tables. is there a more efficient
way to do this than with two tables? perhaps. But idk! help!*/
 
DROP TABLE binpplbylocation;
CREATE TABLE binpplbylocation AS
/*creates table with a column populated with 1 if prefix is Ms or Mrs and 0 if mr*/
SELECT*,
CASE WHEN execprefix='Ms' THEN 1::float
WHEN execprefix='Mrs' THEN 1::float
WHEN execprefix='Mr' THEN 0::float
ELSE Null::int END AS binaryval FROM impopplbylocation;
--67750
 
DROP TABLE nowomenstate;
CREATE TABLE nowomenstate AS
SELECT binpplbylocation.state, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbylocation GROUP BY binpplbylocation.state
ORDER BY percentage ASC;
/*this gives the no of women in each state/all the ppl in a state to yield the percentage
of women in each state who are females in power*/
--54 (includes Guam, D.C., etc.)
/*Wyoming the lowest, at 0, and Hawaii the highest, at 16.9%, Texas at 8.1%*/
 
DROP TABLE nowomencity;
CREATE TABLE nowomencity AS
SELECT binpplbylocation.city, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbylocation GROUP BY binpplbylocation.city
ORDER BY percentage DESC;
--2049
 
SELECT * FROM nowomencity WHERE city='San Antonio';
--Houston has 7.8% of women in leadership positions in all of its companies
--Austin has 9.1%
--Dallas has 9.9%
--San Antonio has 7.4%
 
/*next do sdc pull to group by industry minor group 3, industry major group*/
 
SDC pull #7
 
Custom Report
executive's name prefix
executive's first name
executive's last name
executive's job title
company name
Company major group
company minor group
company sub group 1,2,3
 
perl NormalizeFixedWidth.pl -file="pull7.txt"
 
DROP TABLE execbyindustry;
CREATE TABLE execbyindustry(
execprefix varchar(5),
firstname varchar(50),
lastname varchar(50),
jobtitle varchar(50),
companyname varchar(100),
majorgroup varchar(100),
minorgroup varchar(100),
subgroup1 varchar(100),
subgroup2 varchar(100),
subgroup3 varchar(100)
);
 
\COPY execbyindustry FROM 'pull7-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
--186272
 
DROP TABLE impopplbylocation2;
CREATE TABLE impopplbylocation2 AS
/*selecting the people with important job titles*/
SELECT * FROM execbyindustry WHERE jobtitle IN ('Board Member', 'CEO', 'Founder', 'CFO',
'COO', 'Chairman', 'Chief Executive Officer', 'Chief Financial Officer', 'President',
'Treasurer', 'Vice President');
--67764
 
DROP TABLE binpplbyindustry;
CREATE TABLE binpplbyindustry AS
/*creates table with a column populated with 1 if prefix is Ms or Mrs and 0 if mr*/
SELECT*,
CASE WHEN execprefix='Ms' THEN 1::float
WHEN execprefix='Mrs' THEN 1::float
WHEN execprefix='Mr' THEN 0::float
ELSE Null::int END AS binaryval FROM impopplbylocation2;
--67764
 
DROP TABLE nowomenmajorind;
CREATE TABLE nowomenmajorind AS
SELECT binpplbyindustry.majorgroup, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbyindustry GROUP BY binpplbyindustry.majorgroup
ORDER BY percentage DESC;
--6
/*highest is biotechnology, with 11% of company founders being women. lowest is
semiconductors/other electric, with 4%*/
 
DROP TABLE nowomenminorind;
CREATE TABLE nowomenminorind AS
SELECT binpplbyindustry.minorgroup, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbyindustry GROUP BY binpplbyindustry.minorgroup
ORDER BY percentage DESC;
--10
/*consumer related higher than biotechnology at 11.9% vs 11.5%, with lowest still
semiconductors/other electric at 4%)*/
 
DROP TABLE nowomensub1;
CREATE TABLE nowomensub1 AS
SELECT binpplbyindustry.subgroup1, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbyindustry GROUP BY binpplbyindustry.subgroup1
ORDER BY percentage DESC;
--18
/*pretty similar to minorind; not much difference/nothing revolutionary to add*/
 
DROP TABLE nowomensub2;
CREATE TABLE nowomensub2 AS
SELECT binpplbyindustry.subgroup2, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbyindustry GROUP BY binpplbyindustry.subgroup2
ORDER BY percentage DESC;
--69
/*consumer products = 15%, consumer services = 14%, energy enhanced recovery = 14%,
med/health services 13%, then biotech research at 13%. Lowest: industrial services at
2.5%, idustrial equipment at 2.9%, power supplies at 3.1%, laser related at 3.3%,
optoelectronics at 3.6%*/
 
DROP TABLE nowomensub3;
CREATE TABLE nowomensub3 AS
SELECT binpplbyindustry.subgroup3, sum(binaryval)/count(binaryval) AS percentage
FROM binpplbyindustry GROUP BY binpplbyindustry.subgroup3
ORDER BY percentage ASC;
--560
/*industrial products=50%!, agriculture=33%, ecommerce selling products = 29%,
biotech food enzymes and cultures = 29%, computers and hardware=28%. lowest: there are
straight up 0 in about 75 different things.*/
 
 
/*up next: build table with coname, women ceo&founder (binary 0/1), women in power (0
or 1), #power positions, #women in power, state, year co received first investment,
no rounds, total$invested*/
 
 
=Spring 2016=

Navigation menu