Merged
15 changes: 14 additions & 1 deletion scripts/2-process/gcs_process.py
@@ -62,6 +62,13 @@ def parse_arguments():
return args


def check_for_data_file(file_path):
if os.path.exists(file_path):
raise shared.QuantifyingException(
f"Processed data already exists for {QUARTER}", 0
)


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
@@ -111,6 +118,7 @@ def process_product_totals(args, count_data):
data.items(), columns=["CC legal tool product", "Count"]
)
file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


@@ -192,7 +200,8 @@ def process_latest_prior_retired_totals(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
)
data_to_csv(args, dataframe, file_path)
check_for_data_file(file_path)
data_to_csv(args, dataframe, file_path)


def process_totals_by_free_cultural(args, count_data):
@@ -225,6 +234,7 @@ def process_totals_by_free_cultural(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


@@ -259,6 +269,7 @@ def process_totals_by_restrictions(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


@@ -280,6 +291,7 @@ def process_totals_by_language(args, data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_language.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


@@ -301,6 +313,7 @@ def process_totals_by_country(args, data):
file_path = shared.path_join(
PATHS["data_phase"], "gcs_totals_by_country.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


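The new check_for_data_file() guard raises shared.QuantifyingException with an exit code of 0 when the target CSV already exists, so rerunning the script for an already-processed quarter stops cleanly instead of overwriting data. A minimal standalone sketch of the mechanism, with the exception class reconstructed from the attribute names used in this PR's __main__ handler and an illustrative path:

import os

class QuantifyingException(Exception):
    # Reconstructed from usage in this PR: stores message and exit_code
    def __init__(self, message, exit_code):
        super().__init__(message)
        self.message = message
        self.exit_code = exit_code

def check_for_data_file(file_path):
    if os.path.exists(file_path):
        # Exit code 0 marks an informational stop, not a failure
        raise QuantifyingException(
            f"Processed data already exists: {file_path}", 0
        )

try:
    check_for_data_file("data/2024Q4/gcs_product_totals.csv")  # illustrative path
except QuantifyingException as e:
    # The real scripts route exit code 0 through LOGGER.info and exit cleanly
    print(e.message)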
277 changes: 277 additions & 0 deletions scripts/2-process/openverse_process.py
@@ -0,0 +1,277 @@
#!/usr/bin/env python
"""
This file is dedicated to processing Openverse data
for analysis and comparison between quarters.
"""
# Standard library
import argparse
import csv
import os
import sys
import traceback
from collections import defaultdict

# Third-party
import pandas as pd

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared # noqa: E402

# Setup
LOGGER, PATHS = shared.setup(__file__)

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])


def parse_arguments():
"""
Parse command-line options, returns parsed argument namespace.
"""
LOGGER.info("Parsing command-line options")
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--quarter",
default=QUARTER,
help=f"Data quarter in format YYYYQx (default: {QUARTER})",
)
parser.add_argument(
"--enable-save",
action="store_true",
help="Enable saving results (default: False)",
)
parser.add_argument(
"--enable-git",
action="store_true",
help="Enable git actions such as fetch, merge, add, commit, and push"
" (default: False)",
)
args = parser.parse_args()
if not args.enable_save and args.enable_git:
parser.error("--enable-git requires --enable-save")
if args.quarter != QUARTER:
global PATHS
PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
args.logger = LOGGER
args.paths = PATHS
return args


def check_for_data_file(file_path):
if os.path.exists(file_path):
raise shared.QuantifyingException(
f"Processed data already exists for {QUARTER}", 0
)


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
os.makedirs(PATHS["data_phase"], exist_ok=True)
# emulate csv.unix_dialect
data.to_csv(
file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
)


def process_totals_by_license(args, count_data):
"""
Processing count data: totals by license
"""
LOGGER.info(process_totals_by_license.__doc__.strip())
data = defaultdict(int)

for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
count = int(row.MEDIA_COUNT)

data[tool] += count
data = pd.DataFrame(data.items(), columns=["License", "Count"])
data.sort_values("License", ascending=True, inplace=True)
data.reset_index(drop=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_license.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_totals_by_media_type(args, count_data):
"""
Processing count data: totals by media type
"""

LOGGER.info(process_totals_by_media_type.__doc__.strip())
data = defaultdict(int)

for row in count_data.itertuples(index=False):
media_type = str(row.MEDIA_TYPE)
count = int(row.MEDIA_COUNT)

data[media_type] += count
data = pd.DataFrame(data.items(), columns=["Media_type", "Count"])
data.sort_values("Media_type", ascending=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_media_type.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_totals_by_source(args, count_data):
"""
Processing count data: totals by source
"""
LOGGER.info(process_totals_by_source.__doc__.strip())
data = defaultdict(int)
for row in count_data.itertuples(index=False):
source = str(row.SOURCE)
count = int(row.MEDIA_COUNT)

data[source] += count
data = pd.DataFrame(data.items(), columns=["Source", "Count"])
data.sort_values("Source", ascending=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_source.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_permissive_by_media_type(args, count_data):
"""
Processing count data: permissive by media type
"""
LOGGER.info(process_permissive_by_media_type.__doc__.strip())

data = defaultdict(int)

for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
media_type = str(row.MEDIA_TYPE)
count = int(row.MEDIA_COUNT)

if tool in ["CC0", "CC BY", "CC BY-SA"]:
data[media_type] += count

data = pd.DataFrame(data.items(), columns=["Media_type", "Count"])
data.sort_values("Media_type", ascending=True, inplace=True)

file_path = shared.path_join(
PATHS["data_phase"], "openverse_permissive_by_media_type.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_permissive_by_source(args, count_data):
"""
Processing count data: permissive content by source
"""
LOGGER.info(process_permissive_by_source.__doc__.strip())
data = defaultdict(int)
for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
source = str(row.SOURCE)
count = int(row.MEDIA_COUNT)
if tool in ["CC0", "CC BY", "CC BY-SA"]:
data[source] += count
data = pd.DataFrame(data.items(), columns=["Source", "Count"])
data.sort_values("Source", ascending=True, inplace=True)
file_path = shared.path_join(
PATHS["data_phase"], "openverse_permissive_by_source.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def process_totals_by_restriction(args, count_data):
"""
Processing count data: totals by restriction
"""
# https://creativecommons.org/public-domain/freeworks/
LOGGER.info(process_totals_by_restriction.__doc__.strip())

data = {
"Copyleft": 0,
"Permissive": 0,
"Public domain": 0,
"Restricted": 0,
}

for row in count_data.itertuples(index=False):
tool = str(row.TOOL_IDENTIFIER)
count = int(row.MEDIA_COUNT)

if tool in ["CC0", "PDM"]:
key = "Public domain"

elif tool in ["CC BY"]:
key = "Permissive"

elif tool in ["CC BY-SA"]:
key = "Copyleft"

else:
key = "Restricted"

data[key] += count

data = pd.DataFrame(data.items(), columns=["Category", "Count"])
data.sort_values("Category", ascending=True, inplace=True)

file_path = shared.path_join(
PATHS["data_phase"], "openverse_totals_by_restriction.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])

file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv")
count_data = shared.open_data_file(
LOGGER,
file_count,
usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"],
)
process_totals_by_license(args, count_data)
process_totals_by_media_type(args, count_data)
process_totals_by_source(args, count_data)
process_permissive_by_media_type(args, count_data)
process_permissive_by_source(args, count_data)
process_totals_by_restriction(args, count_data)
# Push changes
args = shared.git_add_and_commit(
args,
PATHS["repo"],
PATHS["data_quarter"],
f"Add and commit new GitHub data for {QUARTER}",
)
shared.git_push_changes(args, PATHS["repo"])


if __name__ == "__main__":
try:
main()
except shared.QuantifyingException as e:
if e.exit_code == 0:
LOGGER.info(e.message)
else:
LOGGER.error(e.message)
sys.exit(e.exit_code)
except SystemExit as e:
LOGGER.error(f"System exit with code: {e.code}")
sys.exit(e.code)
except KeyboardInterrupt:
LOGGER.info("(130) Halted via KeyboardInterrupt.")
sys.exit(130)
except Exception:
LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
sys.exit(1)
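Each process_* function in openverse_process.py follows the same aggregate-then-frame pattern: accumulate counts in a defaultdict keyed by a column value, then convert to a sorted two-column DataFrame. A standalone sketch with made-up rows (column names match the fetched CSV; values are illustrative):

from collections import defaultdict

import pandas as pd

rows = [("CC BY", 3), ("CC0", 2), ("CC BY", 5)]  # (TOOL_IDENTIFIER, MEDIA_COUNT), illustrative
data = defaultdict(int)
for tool, count in rows:
    data[tool] += count

df = pd.DataFrame(data.items(), columns=["License", "Count"])
df.sort_values("License", ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
print(df)  # CC BY: 8, CC0: 2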
3 changes: 2 additions & 1 deletion scripts/3-report/gcs_report.py
100755 → 100644
@@ -9,6 +9,7 @@
import sys
import textwrap
import traceback
from pathlib import Path

# Third-party
from pygments import highlight
@@ -27,7 +28,7 @@

# Constants
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "Google Custom Search (GCS)"
SECTION = Path(__file__).name


def parse_arguments():
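Note that Path(__file__).name evaluates to the script's filename (the final path component), so SECTION becomes the literal file name rather than a hand-written title. An illustrative evaluation:

from pathlib import Path

# For a script located at scripts/3-report/gcs_report.py:
print(Path("scripts/3-report/gcs_report.py").name)  # gcs_report.py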
3 changes: 2 additions & 1 deletion scripts/3-report/github_report.py
100755 → 100644
@@ -9,6 +9,7 @@
import sys
import textwrap
import traceback
from pathlib import Path

# Third-party
from pygments import highlight
@@ -25,7 +26,7 @@
# Setup
LOGGER, PATHS = shared.setup(__file__)
QUARTER = os.path.basename(PATHS["data_quarter"])
SECTION = "GitHub data"
SECTION = Path(__file__).name


def parse_arguments():