Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions admin_ui/admin/data_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
CHANGABLE_INLINES = (InProgressInline, InReviewInline, InAdminReviewInline)


@admin.register(models.UrlValidation)
@admin.register(models.Alias)
@admin.register(models.Image)
@admin.register(models.GcmdProject)
Expand Down
1 change: 1 addition & 0 deletions admin_ui/views/change.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ class ChangeUpdateView(mixins.ChangeModelFormMixin, UpdateView):
"Season": "season-list-draft",
"Website": "website-list-draft",
"WebsiteType": "website_type-list-draft",
"UrlValidation": "url_validation-list-draft",
"Repository": "repository-list-draft",
}

Expand Down
6 changes: 6 additions & 0 deletions cmr/tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from celery import shared_task
from cmr.doi_matching import DoiMatcher
from data_models.website_analyzer import run_validator_and_store


@shared_task
def validate_websites_and_store():
    """Celery task wrapper around ``run_validator_and_store``.

    Delegates all work to ``data_models.website_analyzer`` and hands back
    whatever that function returns.

    NOTE(review): the returned entries appear to carry ContentType instances
    and UUIDs -- confirm the configured Celery result serializer can encode
    them, or that task results are ignored.
    """
    validation_results = run_validator_and_store()
    return validation_results


@shared_task
Expand Down
49 changes: 49 additions & 0 deletions data_models/migrations/0045_urlvalidation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Generated by Django 3.1.3 on 2022-03-03 22:12

import datetime
from django.db import migrations, models
import django.db.models.deletion
import uuid


class Migration(migrations.Migration):
    """Create the ``UrlValidation`` model."""

    # Must run after the contenttypes app is in place (the new model has a
    # foreign key into it) and after the previous data_models migration.
    dependencies = [
        ('contenttypes', '0002_remove_content_type_name'),
        ('data_models', '0044_auto_20220203_1500'),
    ]

    operations = [
        migrations.CreateModel(
            name='UrlValidation',
            fields=[
                # UUID primary key, generated with uuid4 and not editable.
                (
                    'uuid',
                    models.UUIDField(
                        default=uuid.uuid4,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                    ),
                ),
                # UUID of the object the URL was found on; paired with
                # url_content_type below.
                ('url_object_id', models.UUIDField()),
                ('url_source_field', models.TextField()),
                ('url', models.URLField(max_length=1024)),
                # Defaults to the time of row creation via datetime.now.
                ('last_validated', models.DateTimeField(blank=True, default=datetime.datetime.now)),
                # Nullable boolean: may be True, False or unset.
                ('is_active', models.BooleanField(blank=True, null=True)),
                ('details', models.TextField(blank=True, default='')),
                # Rows are removed together with their ContentType (CASCADE).
                (
                    'url_content_type',
                    models.ForeignKey(
                        blank=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        to='contenttypes.contenttype',
                    ),
                ),
            ],
            options={
                'verbose_name_plural': 'UrlValidations',
            },
        ),
    ]
25 changes: 24 additions & 1 deletion data_models/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import uuid
import urllib.parse
import uuid
from datetime import datetime

from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation
from django.contrib.contenttypes.models import ContentType
Expand Down Expand Up @@ -58,6 +59,28 @@ def get_file_path(instance, path):
return f"{instance.uuid}.{ext}"


class UrlValidation(BaseModel):
    """Stored outcome of checking a single URL taken from another model's field.

    The object the URL came from is referenced generically through
    ``url_content_type`` + ``url_object_id`` (exposed as ``url_source_model``),
    and ``url_source_field`` holds the name of the field the URL was read from.
    """

    # Generic relation to the object that contains the URL.
    url_content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE, blank=True)
    url_object_id = models.UUIDField()
    url_source_model = GenericForeignKey("url_content_type", "url_object_id")
    url_source_field = models.TextField()

    url = models.URLField(max_length=1024)
    # NOTE(review): datetime.now produces naive datetimes. If USE_TZ is enabled,
    # django.utils.timezone.now would be the better default, but switching
    # requires a follow-up migration, so it is left unchanged here.
    last_validated = models.DateTimeField(default=datetime.now, blank=True)
    # Nullable so that "not yet determined" can be told apart from False.
    is_active = models.BooleanField(blank=True, null=True)
    # Free-text notes about the check, e.g. the error raised while requesting.
    details = models.TextField(default="", blank=True)

    class Meta:
        verbose_name_plural = "UrlValidations"

    def __str__(self):
        return self.url

    @property
    def model_name(self):
        """Return the class name of the model the URL was found on.

        ``ContentType.model_class`` is a method; the previous implementation
        returned the bound method itself rather than a name. If the model
        class can no longer be resolved (``model_class()`` returns ``None``),
        the model label stored on the content type is returned instead.
        """
        source_model = self.url_content_type.model_class()
        if source_model is None:
            return self.url_content_type.model
        return source_model.__name__


class Image(BaseModel):
image = models.ImageField(upload_to=get_file_path)
title = models.CharField(max_length=1024, default="", blank=True)
Expand Down
180 changes: 180 additions & 0 deletions data_models/website_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import re

import requests
from django.contrib.contenttypes.models import ContentType

from data_models import models

# Model class name (looked up on data_models.models) -> names of the fields on
# that model whose text is scanned for URLs.
FIELDS_TO_VALIDATE = {
    "PartnerOrg": {"website"},
    "Image": {"source_url"},
    "FocusArea": {"url"},
    "Website": {"url"},
    "Platform": {"online_information"},
    "Instrument": {
        "calibration_information",
        "overview_publication",
        "online_information",
    },
    "IOP": {"published_list", "reports"},
    "SignificantEvent": {"published_list", "reports"},
    "CollectionPeriod": {"instrument_information_source"},
}


# Compiled once at import time; the scheme is optional so that bare hosts such
# as "www.nasa.gov" are picked up too. Pattern from
# https://stackoverflow.com/a/48769624
_URL_PATTERN = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+')


def extract_urls(text_with_urls):
    """URLs might be written amongst other words in a body of text. This
    function takes in a text string and returns a list of identified urls.

    Args:
        text_with_urls (str | None): text which may contain urls. ``None`` and
            the empty string are accepted and yield an empty list, so callers
            can pass the value of an unset field directly.

    Returns:
        list: list of strings, where each string is a url, in order of
            appearance
    """

    if not text_with_urls:
        return []

    # The pattern allows '.' inside a url, so a sentence-ending period gets
    # captured as part of the match ("see http://x.org/page."). Strip it.
    candidates = (match.rstrip('.') for match in _URL_PATTERN.findall(text_with_urls))

    # After stripping, anything without a dot left (e.g. "wait...") cannot be
    # a host name and is discarded.
    return [candidate for candidate in candidates if '.' in candidate]


def add_url_scheme(url):
    """Some URLs in the database are lacking http(s) and cannot be accessed via the
    requests module. This function detects if a scheme is missing and adds http.

    The check looks for a complete scheme prefix (``http://``, ``https://`` or
    ``ftp://``, case-insensitively) rather than just the letters "http", so a
    host such as ``httpbin.org`` still gets a scheme and an ``ftp://`` url is
    not turned into ``http://ftp://...``.

    Args:
        url (str): url string which may be missing a scheme

    Returns:
        str: the url unchanged if it already has a recognised scheme,
            otherwise the url prefixed with 'http://'
    """

    if url.lower().startswith(('http://', 'https://', 'ftp://')):
        return url

    return 'http://' + url


def validate_url(url, timeout=10):
    """URLs in the MI might no longer point to a valid webpage. This function
    takes a URL and requests a status code from the webpage with a HEAD
    request. Redirects are followed so that the status of the final page is
    the one that is judged. A url counts as valid unless the final status
    code is 404. If the request itself fails, the url is reported as not
    valid and the error text is returned in the details.

    Args:
        url (str): url to be validated
        timeout (float, optional): seconds to wait for the server before
            giving up. Without a timeout a single unresponsive host could
            block the whole validation run. Defaults to 10.

    Returns:
        dict: {'valid': bool, 'details': str}; details is an empty string
            when the request completed, otherwise a description of the error
    """

    # NOTE: some sites, like http://www.ipy.org/, reportedly raise a
    # ConnectionError unless a browser-like User-Agent header is sent.

    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except Exception as e:
        # Deliberately broad: this runs over every url in the database and one
        # malformed or unreachable url must not abort the batch.
        return {
            'valid': False,
            'details': f'validator encountered an error of: {e}',
        }

    return {
        'valid': response.status_code != 404,
        'details': '',
    }


def compile_urls_list(fields_to_search=FIELDS_TO_VALIDATE):
    """The MI contains many fields which may or may not contain one or more urls.
    This function takes a dictionary of models and fields and extracts urls from
    each field.

    Args:
        fields_to_search (dict, optional): maps a model class name to the
            names of the fields on that model which may contain urls.
            Defaults to FIELDS_TO_VALIDATE.

    Returns:
        list of dicts: one dictionary per url found,
            [{uuid, model_name, content_type, field_name, url}, ..]
    """

    urls_to_validate = []
    for model_name, field_names in fields_to_search.items():
        model = getattr(models, model_name)
        # The content type depends only on the model, so look it up once per
        # model instead of once per url.
        content_type = ContentType.objects.get_for_model(model)

        for instance in model.objects.all():
            for field_name in field_names:
                field_value = getattr(instance, field_name)

                # Unset fields (None or "") have nothing to extract.
                if not field_value:
                    continue

                for url in extract_urls(field_value):
                    urls_to_validate.append(
                        {
                            'uuid': instance.uuid,
                            'model_name': model_name,
                            'content_type': content_type,
                            'field_name': field_name,
                            'url': add_url_scheme(url),
                        }
                    )
    return urls_to_validate


def validate_urls(url_list):
    """Check every entry of ``url_list`` and record the outcome on the entry.

    Each dictionary is updated in place with the 'valid' and 'details' values
    produced by ``validate_url`` for its 'url'.

    Args:
        url_list (list): list of url dictionaries, as output by compile_urls_list

    Returns:
        list[dict]: the same list object, each entry now also holding
            'valid' and 'details'
    """

    for entry in url_list:
        outcome = validate_url(entry['url'])
        for key in ('valid', 'details'):
            entry[key] = outcome[key]

    return url_list


def run_validator_and_store():
    """Collect every url listed in FIELDS_TO_VALIDATE, validate each one and
    store one UrlValidation row per url.

    Returns:
        list[dict]: the validated url dictionaries, as returned by
            validate_urls
    """

    validation_data = validate_urls(compile_urls_list(FIELDS_TO_VALIDATE))

    for url_data in validation_data:
        # objects.create() already saves the new row; calling save() again
        # afterwards only issued a second, redundant write.
        models.UrlValidation.objects.create(
            url_content_type=url_data['content_type'],
            url_object_id=url_data['uuid'],
            url_source_field=url_data['field_name'],
            url=url_data['url'],
            is_active=url_data['valid'],
            details=url_data['details'],
        )

    return validation_data