Initial commit

main
usretc 3 years ago
parent 9e26ef4202
commit 45a9729855

.gitignore vendored

@ -1,3 +1,5 @@
#Directory (KDE)
.directory
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/

@ -1,3 +1,13 @@
# marseille-perils-webcrawler
A Scrapy-based webcrawler that extracts peril orders (arrêtés de péril) from the City of Marseille's website
# Python virtual environment
python3 -m venv scrapy-env
(unix-like) source scrapy-env/bin/activate
(windows) scrapy-env\Scripts\activate.bat
# Installing the dependencies
python -m pip install --upgrade pip
python -m pip install regex
python -m pip install scrapy
# Running scrapy
cd src
scrapy crawl perils -O perils.csv
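# Output (sketch)
The crawl writes one row per street number; the column names are taken from the keys yielded by the SplitAndSort middleware in perils/splittermidware.py: '', 'Nom de rue', 'Statut', 'Dernier arrêté (hors modificatif)', 'Arrêtés', 'Données brutes', 'QGIS-RAW', 'QGIS-City', 'QGIS-Country'.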

Binary file not shown.

@ -0,0 +1,247 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"

@ -0,0 +1,69 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV="/home/p/Documents/Mapping/scraping/perils/scrapy-env"
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/bin:$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1="(scrapy-env) ${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT="(scrapy-env) "
export VIRTUAL_ENV_PROMPT
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi

@ -0,0 +1,26 @@
# This file must be used with "source bin/activate.csh" *from csh*.
# You cannot run it directly.
# Created by Davide Di Blasi <davidedb@gmail.com>.
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
# Unset irrelevant variables.
deactivate nondestructive
setenv VIRTUAL_ENV "/home/p/Documents/Mapping/scraping/perils/scrapy-env"
set _OLD_VIRTUAL_PATH="$PATH"
setenv PATH "$VIRTUAL_ENV/bin:$PATH"
set _OLD_VIRTUAL_PROMPT="$prompt"
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
set prompt = "(scrapy-env) $prompt"
setenv VIRTUAL_ENV_PROMPT "(scrapy-env) "
endif
alias pydoc python -m pydoc
rehash

@ -0,0 +1,66 @@
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
# (https://fishshell.com/); you cannot run it directly.
function deactivate -d "Exit virtual environment and return to normal shell environment"
# reset old environment variables
if test -n "$_OLD_VIRTUAL_PATH"
set -gx PATH $_OLD_VIRTUAL_PATH
set -e _OLD_VIRTUAL_PATH
end
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
set -e _OLD_VIRTUAL_PYTHONHOME
end
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
functions -e fish_prompt
set -e _OLD_FISH_PROMPT_OVERRIDE
functions -c _old_fish_prompt fish_prompt
functions -e _old_fish_prompt
end
set -e VIRTUAL_ENV
set -e VIRTUAL_ENV_PROMPT
if test "$argv[1]" != "nondestructive"
# Self-destruct!
functions -e deactivate
end
end
# Unset irrelevant variables.
deactivate nondestructive
set -gx VIRTUAL_ENV "/home/p/Documents/Mapping/scraping/perils/scrapy-env"
set -gx _OLD_VIRTUAL_PATH $PATH
set -gx PATH "$VIRTUAL_ENV/bin" $PATH
# Unset PYTHONHOME if set.
if set -q PYTHONHOME
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
set -e PYTHONHOME
end
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
# fish uses a function instead of an env var to generate the prompt.
# Save the current fish_prompt function as the function _old_fish_prompt.
functions -c fish_prompt _old_fish_prompt
# With the original prompt function renamed, we can override with our own.
function fish_prompt
# Save the return status of the last command.
set -l old_status $status
# Output the venv prompt; color taken from the blue of the Python logo.
printf "%s%s%s" (set_color 4B8BBE) "(scrapy-env) " (set_color normal)
# Restore the return status of the previous command.
echo "exit $old_status" | .
# Output the original/"old" prompt.
_old_fish_prompt
end
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
set -gx VIRTUAL_ENV_PROMPT "(scrapy-env) "
end

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from automat._visualize import tool
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(tool())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.conch.scripts.cftp import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.conch.scripts.ckeygen import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.conch.scripts.conch import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,54 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
import sys
import json
import argparse
from pprint import pformat
import jmespath
from jmespath import exceptions
def main():
parser = argparse.ArgumentParser()
parser.add_argument('expression')
parser.add_argument('-f', '--filename',
help=('The filename containing the input data. '
'If a filename is not given then data is '
'read from stdin.'))
parser.add_argument('--ast', action='store_true',
help=('Pretty print the AST, do not search the data.'))
args = parser.parse_args()
expression = args.expression
if args.ast:
# Only print the AST
expression = jmespath.compile(args.expression)
sys.stdout.write(pformat(expression.parsed))
sys.stdout.write('\n')
return 0
if args.filename:
with open(args.filename, 'r') as f:
data = json.load(f)
else:
data = sys.stdin.read()
data = json.loads(data)
try:
sys.stdout.write(json.dumps(
jmespath.search(expression, data), indent=4, ensure_ascii=False))
sys.stdout.write('\n')
except exceptions.ArityError as e:
sys.stderr.write("invalid-arity: %s\n" % e)
return 1
except exceptions.JMESPathTypeError as e:
sys.stderr.write("invalid-type: %s\n" % e)
return 1
except exceptions.UnknownFunctionError as e:
sys.stderr.write("unknown-function: %s\n" % e)
return 1
except exceptions.ParseError as e:
sys.stderr.write("syntax-error: %s\n" % e)
return 1
if __name__ == '__main__':
sys.exit(main())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.mail.scripts.mailmail import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer.cli.normalizer import cli_detect
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli_detect())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.scripts.htmlizer import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1 @@
/usr/bin/python3

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from scrapy.cmdline import execute
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(execute())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.conch.scripts.tkconch import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from tldextract.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.scripts.trial import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.application.twist._twist import Twist
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(Twist.main())

@ -0,0 +1,8 @@
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from twisted.scripts.twistd import run
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(run())

@ -0,0 +1,3 @@
home = /usr/bin
include-system-site-packages = false
version = 3.10.8

File diff suppressed because it is too large

@ -0,0 +1,33 @@
import scrapy
# import regex
# from scrapy.loader import ItemLoader
# from itemloaders.processors import TakeFirst, MapCompose
# from w3lib.html import remove_tags
# # A function that splits addresses on street names
# def splitStreet(value):
# # The regex that decides where to split the string
# # (presence of "et / - + ainsi que"
# # as long as the separator is not next to a group of numbers for the same street)
# expr = r"((?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\/|((?<!(\d))\s\bet)|(ainsi\sque)|((?<!(\d))\s-\s(?!(bt)|(bis)|(ter)))|(?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\+)"
# # Replace each separator with a pipe ("|")
# subst = "|"
# repl = regex.sub(expr, subst, value, 0, regex.MULTILINE | regex.IGNORECASE)
# # Formatting (strip non-breaking spaces and ":")
# filtered = regex.sub('\\xa0|:', '', repl, 0, regex.MULTILINE | regex.IGNORECASE)
# # Split into a list
# splitted = regex.split('\|', filtered, regex.MULTILINE | regex.IGNORECASE)
# # Strip leading/trailing whitespace
# stripped = [x.strip() for x in splitted]
# # Drop "None" entries from the list (failsafe, should not be necessary)
# result = list(filter(None, stripped))
# return result
class PerilsItem(scrapy.Item):
adrs = scrapy.Field()
dernierA = scrapy.Field()
As = scrapy.Field()
raw = scrapy.Field()
pass

@ -0,0 +1,141 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
# from copy import deepcopy
# import regex
# class SplitByStreet:
# @classmethod
# # A function that splits addresses on street names
# def splitStreet(self, value):
# # The regex that decides where to split the string
# # (presence of "et / - + ainsi que"
# # as long as the separator is not next to a group of numbers for the same street)
# expr = r"((?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\/|((?<!(\d))\s\bet)|(ainsi\sque)|((?<!(\d))\s-\s(?!(bt)|(bis)|(ter)))|(?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\+)"
# # Replace each separator with a pipe ("|")
# subst = "|"
# repl = regex.sub(expr, subst, value, 0, regex.MULTILINE | regex.IGNORECASE)
# # Formatting (strip non-breaking spaces and ":")
# filtered = regex.sub('\\xa0|:', '', repl, 0, regex.MULTILINE | regex.IGNORECASE)
# # Split into a list
# splitted = regex.split('\|', filtered, regex.MULTILINE | regex.IGNORECASE)
# # Strip leading/trailing whitespace
# stripped = [x.strip() for x in splitted]
# # Drop "None" entries from the list (failsafe, should not be necessary)
# value = list(filter(None, stripped))
# return value
# def process_spider_output(self, response, result, spider):
# for r in result:
# adresses = r.pop("adrs")
# adresses = adresses[0]
# indiv = self.splitStreet(adresses)
# index = 0
# for i in indiv:
# d = {"adrs":indiv[index]}
# index += 1
# yield d
class PerilsSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class PerilsDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)

@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PerilsPipeline:
def process_item(self, item, spider):
return item

@ -0,0 +1,93 @@
# Scrapy settings for perils project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'perils'
SPIDER_MODULES = ['perils.spiders']
NEWSPIDER_MODULE = 'perils.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'perils (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'perils.middlewares.PerilsSpiderMiddleware': 543,
'perils.splittermidware.SplitAndSort': 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'perils.middlewares.PerilsDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'perils.pipelines.PerilsPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -0,0 +1,30 @@
import scrapy
from perils.items import PerilsItem
# from scrapy.loader import ItemLoader
class ScrapePerils(scrapy.Spider):
name = "perils"
start_urls = ["https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"]
def parse(self, response):
for adresses in response.xpath('//div[@class="card"]//li|//div[@class="card"]//li/p'):
item = PerilsItem()
# l = ItemLoader(item = PerilsItem(), selector=adresses)
# l.add_xpath('adrs', './text()')
# l.add_xpath('dernierA', './a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()')
# if adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get() is None:
# l.add_xpath('dernierA', './a[last()]/text()')
item['adrs'] = adresses.xpath('./text()').get(),
item['dernierA'] = adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get(),
if adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get() is None:
item['dernierA'] = adresses.xpath('./a[last()]/text()').get(),
item['As'] = adresses.xpath('./a/text()').getall()
item['raw'] = adresses.xpath('.').get()
yield item
#response.xpath('//div[@class="card"]//li/text()[1]|//div[@class="card"]//li/p/text()[1]').getall()
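A quick way to check these selectors by hand is the Scrapy shell; a minimal sketch, using the spider's own start URL:
scrapy shell "https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"
>>> response.xpath('//div[@class="card"]//li/text()').getall()[:5]
>>> response.xpath('//div[@class="card"]//li/a[last()]/text()').get()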

@ -0,0 +1,13 @@
import scrapy
class ScrapePerils(scrapy.Spider):
name = "perils"
start_urls = ["https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"]
def parse(self, response):
for adresses in response.xpath('//div[@class="card"]//li'):
yield {
'adresse': adresses.xpath('./text()').get(),
'dernier arrêté hors modificatif' : adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get(),
'dernier arrêté' : adresses.xpath('./a[last()]/text()').get(),
}

@ -0,0 +1,211 @@
# Dependencies
from scrapy import signals
# Python has a built-in regex module (re.py), but it has a few limitations that make these patterns unworkable here,
# so we import regex.py (pip install regex) instead, which extends re.py (notably with error-tolerant / fuzzy matching)
import regex
class SplitAndSort:
@classmethod
# A function that splits addresses on street names
def splitStreet(self, value):
# The regex that decides where to split the string
# (presence of "et / - + ainsi que")
# (as long as the separator is not next to a group of numbers for the same street)
expr = r"((?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))(|\bet)|(?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\/|(ainsi\sque)|((?<!(\d))\s-\s(?!(bt)|(bis)|(ter)))|(?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\+)"
# Replace each separator with a pipe ("|")
subst = "|"
repl = regex.sub(expr, subst, value, 0, regex.MULTILINE | regex.IGNORECASE)
# Formatting (strip non-breaking spaces and ":")
filtered = regex.sub('\\xa0|:', '', repl, 0, regex.MULTILINE | regex.IGNORECASE)
# Split into a list
splitted = regex.split(r'\|', filtered, flags=regex.MULTILINE | regex.IGNORECASE)
# Strip leading/trailing whitespace
stripped = [x.strip() for x in splitted]
# Drop "None" entries from the list (failsafe, should not be necessary)
result = list(filter(None, stripped))
return result
# A function that keeps only what looks like a street number / street name pair
def isWorth(self, value):
# The regex that decides what to keep
expr = r"(?=.*[a-z])(?=.*\d+).*"
# A regex of exceptions to check against first
# (use case: "jardin public du 19 mars 1962" would otherwise be recognised as a number/street pair)
# not very elegant, a stricter regex would be better,
# but it would be too complex to be worth it for a few rare exceptions
# (more can be added here later if needed)
xcp = r"(jardin public)|(Bâtiment 12)"
# The (boolean) result of the regex match, after checking the exceptions
if bool(regex.findall(xcp, value, regex.MULTILINE | regex.IGNORECASE)) is False :
result = bool(regex.findall(expr, value, regex.MULTILINE | regex.IGNORECASE))
else :
result = False
return result
# A function that strips low-value or cumbersome bits from the addresses
# ("immeuble", text in parentheses, information about the order, etc.)
def removeClutter(self,value):
# The regex that decides what to remove
expr = r"\(.*\)|Immeuble|Chapelle,|Arrêté.*|\((.*){0,3}|"
# The result after removal
result = regex.sub(expr, '', value, flags=regex.IGNORECASE | regex.MULTILINE)
stripped = result.strip()
return stripped
# A function that separates the numbers from the addresses
def separateNbr(self,value):
# The regex that decides what counts as a number or group of numbers
expr = r"(((\d).*(\da\b))(?=\s?\S?.{8,})|((\d).*(\db\b))(?=\s?\S?.{8,})|((\d).*(\dt\b))(?=\s?\S?.{8,})|((\d).*(\dbis\b))(?=\s?\S?.{8,})|((\d).*(\dter\b))(?=\s?\S?.{8,})|((\d).*(\d))(?=\s?\S?.{8,})|(\da\b)(?=\s?\S?.{8,})|(\db\b)(?=\s?\S?.{8,})|(\dbis\b)(?=\s?\S?.{8,})|(\dt\b)(?=\s?\S?.{8,})|(\dter\b)(?=\s?\S?.{8,})|(\d)(?=\s?\S?.{8,}))"
# Extract the numeric group and store it in the variable "nbr"
sep = regex.search(expr, value, regex.IGNORECASE | regex.MULTILINE)
nbr = sep[0]
# Remove the text held in "nbr" from the address (and trim extra whitespace)
invexpr = regex.compile(nbr)
name = regex.sub(invexpr, '', value)
name = name.strip()
# Return the numeric group on one side and the street name on the other
return (nbr,name)
# A function that splits the numbers apart (ranges excluded)
def splitNumber(self, value):
# The regex that decides where to split the numbers
expr = r"((?<=\d)\s?(&|et|_|-|\/|\+|,)\s?(?=\d))|((?<=\da)\s?(&|et|_|-|\/|\+|,)\s?(?=\d))|((?<=\db)\s?(&|et|_|-|\/|\+|,)\s?(?=\d))|((?<=\dt)\s?(&|et|_|-|\/|\+|,)\s?(?=\d))|((?<=\dbis)\s?(&|et|_|-|\/|\+|,)\s?(?=\d))|((?<=\dter)\s?(&|et|_|-|\/|\+|,)\s?(?=\d))"
# Replace each separator with a pipe ("|")
repl = regex.sub(expr, "|", value, flags=regex.MULTILINE | regex.IGNORECASE)
# Split into a list
splitted = regex.split(r'\|', repl, flags=regex.MULTILINE | regex.IGNORECASE)
# Strip leading/trailing whitespace
stripped = [x.strip() for x in splitted]
# Drop "None" entries from the list (failsafe, should not be necessary)
value = list(filter(None, stripped))
return value
# A function that interpolates the missing numbers from ranges
def splitRanges(self,value):
# The regex that decides where to split the numbers
expr = r"((?<=\d)\s?(au|à)\s?(?=\d))|((?<=\da)\s?(au|à)\s?(?=\d))|((?<=\db)\s?(au|à)\s?(?=\d))|((?<=\dt)\s?(au|à)\s?(?=\d))|((?<=\dbis)\s?(au|à)\s?(?=\d))|((?<=\dter)\s?(au|à)\s?(?=\d))"
# Replace each separator with a pipe ("|")
repl = regex.sub(expr, "|", value, flags=regex.MULTILINE | regex.IGNORECASE)
# Split into a list
splitted = regex.split(r'\|', repl, flags=regex.MULTILINE | regex.IGNORECASE)
# Strip leading/trailing whitespace
stripped = [x.strip() for x in splitted]
# Drop "None" entries from the list (failsafe, should not be necessary)
listed = list(filter(None, stripped))
# If it really is a range:
if len(listed) > 1 :
# Start an empty list and an index equal to the first number of the range
rawList = []
index = int(listed[0])
# As long as the index does not exceed the last number of the range...
while index <= int(listed[1]):
# append the index to the list
rawList.append(index)
# and increase the index by one
index += 1
# A small check that drops the even or odd numbers depending on the parity of the first number.
if int(listed[0])%2 == 0 :
for i in rawList :
if i%2 != 0 :
rawList.remove(i)
if int(listed[0])%2 != 0 :
for i in rawList :
if i%2 == 0 :
rawList.remove(i)
return rawList
else :
return value
def removeAbrog(self,value):
expr = r"(?:((main\s?-?levée)|(abrog))){i<=2,d<=2,e<=3}"
part = r"(?:(partiel)){i<=2,d<=2,e<=3}"
if bool(regex.findall(expr, value, regex.MULTILINE | regex.IGNORECASE)):
if bool(regex.findall(part, value, regex.MULTILINE | regex.IGNORECASE)):
return 1 # partially revoked (mainlevée partielle)
else :
return 2 # revoked (mainlevée / abrogation)
else :
return 0
def typeOf(self,value):
# safety / peril order ("mise en sécurité", "péril")
peril = r"(?:((?<!périmètre)(sécurité|péril))){i<=2,d<=2,e<=2}"
# safety perimeter ("périmètre de sécurité")
perimetre = r"(?:(périmètre)){i<=2,d<=2,e<=3}"
# imminent
urgent = r"(?:(urgent|imminent|grave)){i<=2,d<=2,e<=3}"
# occupancy ban ("interdiction d'occupation")
interdi = r"(?:((?<=(occup)).*(interdi)|(?<=(util)).*(interdi)|(interdi).*(?=(occup))|(interdi).*(?=(util)))){i<=2,d<=2,e<=3}"
# demolition order ("déconstruction")
deconstr = r"(?:(déconstr)){i<=2,d<=2,e<=3}"
# administrative penalty payment ("astreinte administrative")
astr = r"(?:(astreinte)){i<=2,d<=2,e<=3}"
if bool(regex.findall(perimetre, value, regex.MULTILINE | regex.IGNORECASE)):
return "Périmètre de sécurité"
elif bool(regex.findall(interdi, value, regex.MULTILINE | regex.IGNORECASE)):
return "Interdiction d'occupation"
elif bool(regex.findall(deconstr, value, regex.MULTILINE | regex.IGNORECASE)):
return "Déconstruction"
elif bool(regex.findall(astr, value, regex.MULTILINE | regex.IGNORECASE)):
return "Astreinte administrative"
elif bool(regex.findall(peril, value, regex.MULTILINE | regex.IGNORECASE)):
if bool(regex.findall(urgent, value, regex.MULTILINE | regex.IGNORECASE)):
return "Péril imminent"
else :
return "Péril"
else :
return ""
# The main function that processes scrapy's request results before yielding
def process_spider_output(self, response, result, spider):
# For each individual entry in the result of the request:
for r in result:
# Pull the raw address text and store it in the variable "adresses"
adresses = r.pop('adrs')
# Pull the latest order and store it in the variable "dernA"
dernA = r.pop('dernierA')
As = r.pop('As')
raw = r.pop('raw')
# Keep only the first element of the list
# (there is only one anyway; the trailing commas in the spider turn these fields into one-element
# tuples, and regex.py cannot work on tuple objects, so we extract the plain string here)
adresses = adresses[0]
dernA = dernA[0]
# If the address is non-null:
# (keeps regex.py from crashing on a None object instead of a string)
# (this also drops orders without an address; to date there is only one, on a private passage)
if dernA :
abrog = self.removeAbrog(dernA)
if abrog == 1:
typeof = "(Mainlevée partielle) "+ self.typeOf(dernA)
elif abrog == 2:
typeof = "Mainlevée"
else :
typeof = self.typeOf(dernA)
if adresses :
# Call splitStreet to split the streets held in "adresses" and store them in the list "indiv"
indiv = self.splitStreet(adresses)
# For each address in the list "indiv" obtained above:
for i in indiv:
# If the address is worth keeping
if self.isWorth(i) :
iClean = self.removeClutter(i)
iSep = self.separateNbr(iClean)
iNbrs = self.splitNumber(iSep[0])
for r in iNbrs:
x = self.splitRanges(r)
print(x)
if isinstance(x, list):
for n in x:
yield {'':n,'Nom de rue':iSep[1],'Statut':typeof,'Dernier arrêté (hors modificatif)':dernA,'Arrêtés':As,'Données brutes':raw,"QGIS-RAW":str(n)+" "+iSep[1],"QGIS-City":"Marseille","QGIS-Country":"France"}
else :
yield {'':r,'Nom de rue':iSep[1],'Statut':typeof,'Dernier arrêté (hors modificatif)':dernA,'Arrêtés':As,'Données brutes':raw,"QGIS-RAW":str(r)+" "+iSep[1],"QGIS-City":"Marseille","QGIS-Country":"France"}
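The middleware above leans on the third-party regex module's fuzzy matching (the {i<=…,d<=…,e<=…} error budgets), which the standard library's re module does not support. A minimal standalone sketch of that feature, using a simplified variant of the pattern from removeAbrog:
import regex  # third-party: pip install regex

# "{e<=2}" tolerates up to 2 errors (insertions, deletions or substitutions) in the
# preceding group, so slightly misspelled or reformatted wordings still match.
pattern = r"(?:(main\s?-?levée)|(abrog)){e<=2}"
for text in ("Arrêté d'abrogation", "Mainlevée partielle", "Péril grave et imminent"):
    print(text, "->", bool(regex.findall(pattern, text, regex.IGNORECASE)))
# Expected: True, True, False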

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = perils.settings
[deploy]
#url = http://localhost:6800/
project = perils